From 0bf19a357e0eaf03e757ac9482c45a797e40157a Mon Sep 17 00:00:00 2001 From: Siddarth G Date: Thu, 3 Apr 2025 15:43:45 +0530 Subject: selftests/mm: convert page_size to unsigned long Cppcheck warning: int result is assigned to long long variable. If the variable is long long to avoid loss of information, then you have loss of information. This patch changes the type of page_size from 'unsigned int' to 'unsigned long' instead of using ULL suffixes. Changing hpage_size to 'unsigned long' was considered, but since gethugepage() expects an int, this change was avoided. Link: https://lkml.kernel.org/r/20250403101345.29226-1-siddarthsgml@gmail.com Signed-off-by: Siddarth G Reported-by: David Binderman Closes: https://lore.kernel.org/all/AS8PR02MB10217315060BBFDB21F19643E9CA62@AS8PR02MB10217.eurprd02.prod.outlook.com/ Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 57b4bba2b45f..fe5ae8b25ff6 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -34,7 +34,7 @@ #define PAGEMAP "/proc/self/pagemap" int pagemap_fd; int uffd; -unsigned int page_size; +unsigned long page_size; unsigned int hpage_size; const char *progname; @@ -184,7 +184,7 @@ void *gethugetlb_mem(int size, int *shmid) int userfaultfd_tests(void) { - int mem_size, vec_size, written, num_pages = 16; + long mem_size, vec_size, written, num_pages = 16; char *mem, *vec; mem_size = num_pages * page_size; @@ -213,7 +213,7 @@ int userfaultfd_tests(void) written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (written < 0) - ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", written, errno, strerror(errno)); ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", __func__); @@ -995,7 +995,7 @@ int unmapped_region_tests(void) { void *start = (void *)0x10000000; int written, len = 0x00040000; - int vec_size = len / page_size; + long vec_size = len / page_size; struct page_region *vec = malloc(sizeof(struct page_region) * vec_size); /* 1. Get written pages */ @@ -1051,7 +1051,7 @@ static void test_simple(void) int sanity_tests(void) { unsigned long long mem_size, vec_size; - int ret, fd, i, buf_size; + long ret, fd, i, buf_size; struct page_region *vec; char *mem, *fmem; struct stat sbuf; @@ -1160,7 +1160,7 @@ int sanity_tests(void) ret = stat(progname, &sbuf); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) -- cgit v1.2.3 From bc6fa711951185fa0fdf5974c50a1c4d0cd65be3 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:20:52 +0200 Subject: selftests/ptrace: add a test case for PTRACE_SET_SYSCALL_INFO Check whether PTRACE_SET_SYSCALL_INFO semantics implemented in the kernel matches userspace expectations. Link: https://lkml.kernel.org/r/20250303112052.GG24170@strace.io Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Cc: Alexander Gordeev Cc: Alexey Gladkov (Intel) Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Charlie Jenkins Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromiatnikov Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Maciej W. Rozycki Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- tools/testing/selftests/ptrace/Makefile | 2 +- tools/testing/selftests/ptrace/set_syscall_info.c | 519 ++++++++++++++++++++++ 2 files changed, 520 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/ptrace/set_syscall_info.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile index 1c631740a730..c5e0b76ba6ac 100644 --- a/tools/testing/selftests/ptrace/Makefile +++ b/tools/testing/selftests/ptrace/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -std=c99 -pthread -Wall $(KHDR_INCLUDES) -TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess get_set_sud +TEST_GEN_PROGS := get_syscall_info set_syscall_info peeksiginfo vmaccess get_set_sud include ../lib.mk diff --git a/tools/testing/selftests/ptrace/set_syscall_info.c b/tools/testing/selftests/ptrace/set_syscall_info.c new file mode 100644 index 000000000000..4198248ef874 --- /dev/null +++ b/tools/testing/selftests/ptrace/set_syscall_info.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2018-2025 Dmitry V. Levin + * All rights reserved. + * + * Check whether PTRACE_SET_SYSCALL_INFO semantics implemented in the kernel + * matches userspace expectations. + */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include + +#if defined(_MIPS_SIM) && _MIPS_SIM == _MIPS_SIM_NABI32 +/* + * MIPS N32 is the only architecture where __kernel_ulong_t + * does not match the bitness of syscall arguments. + */ +typedef unsigned long long kernel_ulong_t; +#else +typedef __kernel_ulong_t kernel_ulong_t; +#endif + +struct si_entry { + int nr; + kernel_ulong_t args[6]; +}; +struct si_exit { + unsigned int is_error; + int rval; +}; + +static unsigned int ptrace_stop; +static pid_t tracee_pid; + +static int +kill_tracee(pid_t pid) +{ + if (!pid) + return 0; + + int saved_errno = errno; + + int rc = kill(pid, SIGKILL); + + errno = saved_errno; + return rc; +} + +static long +sys_ptrace(int request, pid_t pid, unsigned long addr, unsigned long data) +{ + return syscall(__NR_ptrace, request, pid, addr, data); +} + +#define LOG_KILL_TRACEE(fmt, ...) \ + do { \ + kill_tracee(tracee_pid); \ + TH_LOG("wait #%d: " fmt, \ + ptrace_stop, ##__VA_ARGS__); \ + } while (0) + +static void +check_psi_entry(struct __test_metadata *_metadata, + const struct ptrace_syscall_info *info, + const struct si_entry *exp_entry, + const char *text) +{ + unsigned int i; + int exp_nr = exp_entry->nr; +#if defined __s390__ || defined __s390x__ + /* s390 is the only architecture that has 16-bit syscall numbers */ + exp_nr &= 0xffff; +#endif + + ASSERT_EQ(PTRACE_SYSCALL_INFO_ENTRY, info->op) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->arch) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->instruction_pointer) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->stack_pointer) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_EQ(exp_nr, info->entry.nr) { + LOG_KILL_TRACEE("%s: syscall nr mismatch", text); + } + for (i = 0; i < ARRAY_SIZE(exp_entry->args); ++i) { + ASSERT_EQ(exp_entry->args[i], info->entry.args[i]) { + LOG_KILL_TRACEE("%s: syscall arg #%u mismatch", + text, i); + } + } +} + +static void +check_psi_exit(struct __test_metadata *_metadata, + const struct ptrace_syscall_info *info, + const struct si_exit *exp_exit, + const char *text) +{ + ASSERT_EQ(PTRACE_SYSCALL_INFO_EXIT, info->op) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->arch) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->instruction_pointer) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->stack_pointer) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_EQ(exp_exit->is_error, info->exit.is_error) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_EQ(exp_exit->rval, info->exit.rval) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } +} + +TEST(set_syscall_info) +{ + const pid_t tracer_pid = getpid(); + const kernel_ulong_t dummy[] = { + (kernel_ulong_t) 0xdad0bef0bad0fed0ULL, + (kernel_ulong_t) 0xdad1bef1bad1fed1ULL, + (kernel_ulong_t) 0xdad2bef2bad2fed2ULL, + (kernel_ulong_t) 0xdad3bef3bad3fed3ULL, + (kernel_ulong_t) 0xdad4bef4bad4fed4ULL, + (kernel_ulong_t) 0xdad5bef5bad5fed5ULL, + }; + int splice_in[2], splice_out[2]; + + ASSERT_EQ(0, pipe(splice_in)); + ASSERT_EQ(0, pipe(splice_out)); + ASSERT_EQ(sizeof(dummy), write(splice_in[1], dummy, sizeof(dummy))); + + const struct { + struct si_entry entry[2]; + struct si_exit exit[2]; + } si[] = { + /* change scno, keep non-error rval */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, tracer_pid }, { 0, tracer_pid } + } + }, + + /* set scno to -1, keep error rval */ + { + { + { + __NR_chdir, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + -1, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 1, -ENOSYS }, { 1, -ENOSYS } + } + }, + + /* keep scno, change non-error rval */ + { + { + { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, tracer_pid }, { 0, tracer_pid + 1 } + } + }, + + /* change arg1, keep non-error rval */ + { + { + { + __NR_chdir, + { + (uintptr_t) "", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_chdir, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, 0 }, { 0, 0 } + } + }, + + /* set scno to -1, change error rval to non-error */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + -1, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 1, -ENOSYS }, { 0, tracer_pid } + } + }, + + /* change scno, change non-error rval to error */ + { + { + { + __NR_chdir, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, tracer_pid }, { 1, -EISDIR } + } + }, + + /* change scno and all args, change non-error rval */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_splice, + { + splice_in[0], 0, splice_out[1], 0, + sizeof(dummy), SPLICE_F_NONBLOCK + } + } + }, { + { 0, sizeof(dummy) }, { 0, sizeof(dummy) + 1 } + } + }, + + /* change arg1, no exit stop */ + { + { + { + __NR_exit_group, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_exit_group, + { + 0, dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, 0 }, { 0, 0 } + } + }, + }; + + long rc; + unsigned int i; + + tracee_pid = fork(); + + ASSERT_LE(0, tracee_pid) { + TH_LOG("fork: %m"); + } + + if (tracee_pid == 0) { + /* get the pid before PTRACE_TRACEME */ + tracee_pid = getpid(); + ASSERT_EQ(0, sys_ptrace(PTRACE_TRACEME, 0, 0, 0)) { + TH_LOG("PTRACE_TRACEME: %m"); + } + ASSERT_EQ(0, kill(tracee_pid, SIGSTOP)) { + /* cannot happen */ + TH_LOG("kill SIGSTOP: %m"); + } + for (i = 0; i < ARRAY_SIZE(si); ++i) { + rc = syscall(si[i].entry[0].nr, + si[i].entry[0].args[0], + si[i].entry[0].args[1], + si[i].entry[0].args[2], + si[i].entry[0].args[3], + si[i].entry[0].args[4], + si[i].entry[0].args[5]); + if (si[i].exit[1].is_error) { + if (rc != -1 || errno != -si[i].exit[1].rval) + break; + } else { + if (rc != si[i].exit[1].rval) + break; + } + } + /* + * Something went wrong, but in this state tracee + * cannot reliably issue syscalls, so just crash. + */ + *(volatile unsigned char *) (uintptr_t) i = 42; + /* unreachable */ + _exit(i + 1); + } + + for (ptrace_stop = 0; ; ++ptrace_stop) { + struct ptrace_syscall_info info = { + .op = 0xff /* invalid PTRACE_SYSCALL_INFO_* op */ + }; + const size_t size = sizeof(info); + const int expected_entry_size = + (void *) &info.entry.args[6] - (void *) &info; + const int expected_exit_size = + (void *) (&info.exit.is_error + 1) - + (void *) &info; + int status; + + ASSERT_EQ(tracee_pid, wait(&status)) { + /* cannot happen */ + LOG_KILL_TRACEE("wait: %m"); + } + if (WIFEXITED(status)) { + tracee_pid = 0; /* the tracee is no more */ + ASSERT_EQ(0, WEXITSTATUS(status)) { + LOG_KILL_TRACEE("unexpected exit status %u", + WEXITSTATUS(status)); + } + break; + } + ASSERT_FALSE(WIFSIGNALED(status)) { + tracee_pid = 0; /* the tracee is no more */ + LOG_KILL_TRACEE("unexpected signal %u", + WTERMSIG(status)); + } + ASSERT_TRUE(WIFSTOPPED(status)) { + /* cannot happen */ + LOG_KILL_TRACEE("unexpected wait status %#x", status); + } + + ASSERT_LT(ptrace_stop, ARRAY_SIZE(si) * 2) { + LOG_KILL_TRACEE("ptrace stop overflow"); + } + + switch (WSTOPSIG(status)) { + case SIGSTOP: + ASSERT_EQ(0, ptrace_stop) { + LOG_KILL_TRACEE("unexpected signal stop"); + } + ASSERT_EQ(0, sys_ptrace(PTRACE_SETOPTIONS, tracee_pid, + 0, PTRACE_O_TRACESYSGOOD)) { + LOG_KILL_TRACEE("PTRACE_SETOPTIONS: %m"); + } + break; + + case SIGTRAP | 0x80: + ASSERT_LT(0, ptrace_stop) { + LOG_KILL_TRACEE("unexpected syscall stop"); + } + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1: %m"); + } + if (ptrace_stop & 1) { + /* entering syscall */ + const struct si_entry *exp_entry = + &si[ptrace_stop / 2].entry[0]; + const struct si_entry *set_entry = + &si[ptrace_stop / 2].entry[1]; + + /* check ptrace_syscall_info before the changes */ + ASSERT_EQ(expected_entry_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1" + ": entry stop mismatch"); + } + check_psi_entry(_metadata, &info, exp_entry, + "PTRACE_GET_SYSCALL_INFO #1"); + + /* apply the changes */ + info.entry.nr = set_entry->nr; + for (i = 0; i < ARRAY_SIZE(set_entry->args); ++i) + info.entry.args[i] = set_entry->args[i]; + ASSERT_EQ(0, sys_ptrace(PTRACE_SET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info)) { + LOG_KILL_TRACEE("PTRACE_SET_SYSCALL_INFO: %m"); + } + + /* check ptrace_syscall_info after the changes */ + memset(&info, 0, sizeof(info)); + info.op = 0xff; + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m"); + } + ASSERT_EQ(expected_entry_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2" + ": entry stop mismatch"); + } + check_psi_entry(_metadata, &info, set_entry, + "PTRACE_GET_SYSCALL_INFO #2"); + } else { + /* exiting syscall */ + const struct si_exit *exp_exit = + &si[ptrace_stop / 2 - 1].exit[0]; + const struct si_exit *set_exit = + &si[ptrace_stop / 2 - 1].exit[1]; + + /* check ptrace_syscall_info before the changes */ + ASSERT_EQ(expected_exit_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1" + ": exit stop mismatch"); + } + check_psi_exit(_metadata, &info, exp_exit, + "PTRACE_GET_SYSCALL_INFO #1"); + + /* apply the changes */ + info.exit.is_error = set_exit->is_error; + info.exit.rval = set_exit->rval; + ASSERT_EQ(0, sys_ptrace(PTRACE_SET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info)) { + LOG_KILL_TRACEE("PTRACE_SET_SYSCALL_INFO: %m"); + } + + /* check ptrace_syscall_info after the changes */ + memset(&info, 0, sizeof(info)); + info.op = 0xff; + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2: %m"); + } + ASSERT_EQ(expected_exit_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2" + ": exit stop mismatch"); + } + check_psi_exit(_metadata, &info, set_exit, + "PTRACE_GET_SYSCALL_INFO #2"); + } + break; + + default: + LOG_KILL_TRACEE("unexpected stop signal %u", + WSTOPSIG(status)); + abort(); + } + + ASSERT_EQ(0, sys_ptrace(PTRACE_SYSCALL, tracee_pid, 0, 0)) { + LOG_KILL_TRACEE("PTRACE_SYSCALL: %m"); + } + } + + ASSERT_EQ(ptrace_stop, ARRAY_SIZE(si) * 2); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From a9562fd03a5edbc982ef26d729da3f5968533d23 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 24 Mar 2025 06:53:28 +0000 Subject: selftests/mm: add PAGEMAP_SCAN guard region test Add a selftest to verify the PAGEMAP_SCAN ioctl correctly reports guard regions using the newly introduced PAGE_IS_GUARD flag. Link: https://lkml.kernel.org/r/20250324065328.107678-4-avagin@google.com Signed-off-by: Andrei Vagin Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/guard-regions.c | 57 ++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index eba43ead13ae..0cd9d236649d 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -2075,4 +2076,60 @@ TEST_F(guard_regions, pagemap) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +/* + * Assert that PAGEMAP_SCAN correctly reports guard region ranges. + */ +TEST_F(guard_regions, pagemap_scan) +{ + const unsigned long page_size = self->page_size; + struct page_region pm_regs[10]; + struct pm_scan_arg pm_scan_args = { + .size = sizeof(struct pm_scan_arg), + .category_anyof_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + .vec = (long)&pm_regs, + .vec_len = ARRAY_SIZE(pm_regs), + }; + int proc_fd, i; + char *ptr; + + proc_fd = open("/proc/self/pagemap", O_RDONLY); + ASSERT_NE(proc_fd, -1); + + ptr = mmap_(self, variant, NULL, 10 * page_size, + PROT_READ | PROT_WRITE, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + pm_scan_args.start = (long)ptr; + pm_scan_args.end = (long)ptr + 10 * page_size; + ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 0); + ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size); + + /* Install a guard region in every other page. */ + for (i = 0; i < 10; i += 2) { + char *ptr_p = &ptr[i * page_size]; + + ASSERT_EQ(syscall(__NR_madvise, ptr_p, page_size, MADV_GUARD_INSTALL), 0); + } + + /* + * Assert ioctl() returns the count of located regions, where each + * region spans every other page within the range of 10 pages. + */ + ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 5); + ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size); + + /* Re-read from pagemap, and assert guard regions are detected. */ + for (i = 0; i < 5; i++) { + long ptr_p = (long)&ptr[2 * i * page_size]; + + ASSERT_EQ(pm_regs[i].start, ptr_p); + ASSERT_EQ(pm_regs[i].end, ptr_p + page_size); + ASSERT_EQ(pm_regs[i].categories, PAGE_IS_GUARD); + } + + ASSERT_EQ(close(proc_fd), 0); + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From e487a5d513cb6a0faf7e48523416434b111318b3 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Mon, 7 Apr 2025 16:42:01 +0800 Subject: selftest/mm: make hugetlb_reparenting_test tolerant to async reparenting In cgroup v2, memory and hugetlb usage reparenting is asynchronous. This can cause test flakiness when immediately asserting usage after deleting a child cgroup. To address this, add a helper function `assert_with_retry()` that checks usage values with a timeout-based retry. This improves test stability without relying on fixed sleep delays. Also bump up the tolerance size to 7MB. To avoid False Positives: ... # Assert memory charged correctly for child only use. # actual a = 11 MB # expected a = 0 MB # fail # cleanup # [FAIL] not ok 11 hugetlb_reparenting_test.sh -cgroup-v2 # exit=1 # 0 # SUMMARY: PASS=10 SKIP=0 FAIL=1 Link: https://lkml.kernel.org/r/20250407084201.74492-1-liwang@redhat.com Signed-off-by: Li Wang Tested-by: Donet Tom Cc: Waiman Long Cc: Anshuman Khandual Cc: Dev Jain Cc: Kirill A. Shuemov Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/hugetlb_reparenting_test.sh | 96 +++++++++------------- 1 file changed, 41 insertions(+), 55 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index 0b0d4ba1af27..9245549b66cf 100755 --- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -36,7 +36,7 @@ else do_umount=1 fi fi -MNT='/mnt/huge/' +MNT='/mnt/huge' function get_machine_hugepage_size() { hpz=$(grep -i hugepagesize /proc/meminfo) @@ -60,6 +60,41 @@ function cleanup() { set -e } +function assert_with_retry() { + local actual_path="$1" + local expected="$2" + local tolerance=$((7 * 1024 * 1024)) + local timeout=20 + local interval=1 + local start_time + local now + local elapsed + local actual + + start_time=$(date +%s) + + while true; do + actual="$(cat "$actual_path")" + + if [[ $actual -ge $(($expected - $tolerance)) ]] && + [[ $actual -le $(($expected + $tolerance)) ]]; then + return 0 + fi + + now=$(date +%s) + elapsed=$((now - start_time)) + + if [[ $elapsed -ge $timeout ]]; then + echo "actual = $((${actual%% *} / 1024 / 1024)) MB" + echo "expected = $((${expected%% *} / 1024 / 1024)) MB" + cleanup + exit 1 + fi + + sleep $interval + done +} + function assert_state() { local expected_a="$1" local expected_a_hugetlb="$2" @@ -70,58 +105,13 @@ function assert_state() { expected_b="$3" expected_b_hugetlb="$4" fi - local tolerance=$((5 * 1024 * 1024)) - - local actual_a - actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" - if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || - [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then - echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB - echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - local actual_a_hugetlb - actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || - [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then - echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB - echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then - return - fi - - local actual_b - actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" - if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || - [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then - echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB - echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - local actual_b_hugetlb - actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || - [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then - echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB - echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB - echo fail + assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a" + assert_with_retry "$CGROUP_ROOT/a/hugetlb.${MB}MB.$usage_file" "$expected_a_hugetlb" - cleanup - exit 1 + if [[ -n "$expected_b" && -n "$expected_b_hugetlb" ]]; then + assert_with_retry "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b" + assert_with_retry "$CGROUP_ROOT/a/b/hugetlb.${MB}MB.$usage_file" "$expected_b_hugetlb" fi } @@ -174,7 +164,6 @@ size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. cleanup -echo echo echo Test charge, rmdir, uncharge setup @@ -195,7 +184,6 @@ cleanup echo done echo -echo if [[ ! $cgroup2 ]]; then echo "Test parent and child hugetlb usage" setup @@ -212,7 +200,6 @@ if [[ ! $cgroup2 ]]; then assert_state 0 $(($size * 2)) 0 $size rmdir "$CGROUP_ROOT"/a/b - sleep 5 echo Assert memory reparent correctly. assert_state 0 $(($size * 2)) @@ -224,7 +211,6 @@ if [[ ! $cgroup2 ]]; then cleanup fi -echo echo echo "Test child only hugetlb usage" echo setup -- cgit v1.2.3 From 879bca0a2c4f40b08d09a95a2a0c3c6513060b5c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 8 Apr 2025 10:29:31 +0100 Subject: mm/vma: fix incorrectly disallowed anonymous VMA merges Patch series "fix incorrectly disallowed anonymous VMA merges", v2. It appears that we have been incorrectly rejecting merge cases for 15 years, apparently by mistake. Imagine a range of anonymous mapped momemory divided into two VMAs like this, with incompatible protection bits: RW RWX unfaulted faulted |-----------|-----------| | prev | vma | |-----------|-----------| mprotect(RW) Now imagine mprotect()'ing vma so it is RW. This appears as if it should merge, it does not. Neither does this case, again mprotect()'ing vma RW: RWX RW faulted unfaulted |-----------|-----------| | vma | next | |-----------|-----------| mprotect(RW) Nor: RW RWX RW unfaulted faulted unfaulted |-----------|-----------|-----------| | prev | vma | next | |-----------|-----------|-----------| mprotect(RW) What's going on here? In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process server scalability issue"), from 2010, Rik von Riel took careful care to account for these cases - commenting that '[this is] easily overlooked: when mprotect shifts the boundary, make sure the expanding vma has anon_vma set if the shrinking vma had, to cover any anon pages imported.' However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs") introduced a little over a year later, appears to have accidentally disallowed this. By adjusting the is_mergeable_anon_vma() function to avoid lock contention across large trees of forked anon_vma's, this commit wrongly assumed the VMA being checked (the ostensible merge 'target') should be faulted, that is, have an anon_vma, and thus an anon_vma_chain list established, but only of length 1. This appears to have been unintentional, as disallowing empty target VMAs like this across the board makes no sense. We already have logic that accounts for this case, the same logic Rik introduced in 2010, now via dup_anon_vma() (and ultimately anon_vma_clone()), so there is no problem permitting this. This series fixes this mistake and also ensures that scalability concerns remain addressed by explicitly checking that whatever VMA is being merged has not been forked. A full set of self tests which reproduce the issue are provided, as well as updating userland VMA tests to assert this behaviour. The self tests additionally assert scalability concerns are addressed. This patch (of 3): anon_vma_chain's were introduced by Rik von Riel in commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process server scalability issue"). This patch was introduced in March 2010. As part of this change, careful attention was made to the instance of mprotect() causing a VMA merge, with one faulted (i.e. having anon_vma set) and another not: /* * Easily overlooked: when mprotect shifts the boundary, * make sure the expanding vma has anon_vma set if the * shrinking vma had, to cover any anon pages imported. */ In the modern VMA code, this is handled in dup_anon_vma() (and ultimately anon_vma_clone()). This case is one of the three configurations of adjacent VMA anon_vma state that we might encounter on merge (where dst is the VMA which will be merged into and src the one being merged into dst): 1. dst->anon_vma, src->anon_vma - These must be equal, no-op. 2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op. 3. !dst->anon_vma, src->anon_vma - The case in question here. In case 3, the instance addressed here - we duplicate the AVC connections from src and place into dst. However, in practice, we very often do NOT do this. This appears to be due to an inadvertent consequence of the change introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"), introduced in May 2011. This implies that this merge case was functional only for a little over a year, and has since been broken for ~15 years. Here, lock scalability concerns lead to us restricting anonymous merges only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a VMA that is not connected to any parent process's anon_vma. The mergeability test looks like this: static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, struct anon_vma *anon_vma2, struct vm_area_struct *vma) { if ((!anon_vma1 || !anon_vma2) && (!vma || !vma->anon_vma || list_is_singular(&vma->anon_vma_chain))) return true; return anon_vma1 == anon_vma2; } However, we have a problem here - typically the vma passed here is the destination VMA. For instance in vma_merge_existing_range() we invoke: can_vma_merge_left() -> [ check that there is an immediately adjacent prior VMA ] -> can_vma_merge_after() -> is_mergeable_vma() for general attribute check -> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev) So if we were considering a target unfaulted 'prev': unfaulted faulted |-----------|-----------| | prev | vma | |-----------|-----------| This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev). The list_is_singular() check for vma->anon_vma_chain, an empty list on fault, would cause this merge to _fail_ even though all else indicates a merge. Equally a simple merge into a next VMA would hit the same problem: faulted unfaulted |-----------|-----------| | vma | next | |-----------|-----------| can_vma_merge_right() -> [ check that there is an immediately adjacent succeeding VMA ] -> can_vma_merge_before() -> is_mergeable_vma() for general attribute check -> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next) For a 3-way merge, we'd also hit the same problem if it was configured like this for instance: unfaulted faulted unfaulted |-----------|-----------|-----------| | prev | vma | next | |-----------|-----------|-----------| As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for next, both of which would fail. vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as the new VMA would never already be faulted (it is a proposed new range). Because we already handle each of the aforementioned merge cases, and can absolutely therefore deal with an existing VMA merge with !dst->anon_vma, src->anon_vma, there is absolutely no reason to disallow this kind of merge. It seems that the intention of this patch is to ensure that, in the instance of merging unfaulted VMAs with faulted ones, we never wish to do so with those with multiple AVCs due to the fact that anon_vma lock's are held across both parent and child anon_vma's (actually, the 'root' parent anon_vma's lock is used). In fact, the original commit alludes to this - "find_mergeable_anon_vma() already considers this case". In find_mergeable_anon_vma() however, we check the anon_vma which will be merged from, if it is set, then we check list_is_singular(vma->anon_vma_chain). So to match this logic, update is_mergeable_anon_vma() to perform this scalability check on the VMA whose anon_vma we ultimately merge into. This matches existing behaviour with forked VMAs, only we no longer wrongly disallow ALL empty target merges. So we both allow merge cases and ensure the scalability check is correctly applied. We may wish to revisit these lock scalability concerns at a later date and ensure they are still valid. Additionally, correct userland VMA tests which were mistakenly not asserting these cases correctly previously to now correctly assert this, and to ensure vmg->anon_vma state is always consistent to account for newly introduced asserts. Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs") Signed-off-by: Lorenzo Stoakes Reviewed-by: Yeoreum Yun Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- tools/testing/vma/vma.c | 100 +++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 47 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 11f761769b5b..7cfd6e31db10 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -185,6 +185,15 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, vmg->__adjust_next_start = false; } +/* Helper function to set both the VMG range and its anon_vma. */ +static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t flags, + struct anon_vma *anon_vma) +{ + vmg_set_range(vmg, start, end, pgoff, flags); + vmg->anon_vma = anon_vma; +} + /* * Helper function to try to merge a new VMA. * @@ -265,6 +274,22 @@ static void dummy_close(struct vm_area_struct *) { } +static void __vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) +{ + vma->anon_vma = anon_vma; + INIT_LIST_HEAD(&vma->anon_vma_chain); + list_add(&avc->same_vma, &vma->anon_vma_chain); + avc->anon_vma = vma->anon_vma; +} + +static void vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc) +{ + __vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma); +} + static bool test_simple_merge(void) { struct vm_area_struct *vma; @@ -953,6 +978,7 @@ static bool test_merge_existing(void) const struct vm_operations_struct vm_ops = { .close = dummy_close, }; + struct anon_vma_chain avc = {}; /* * Merge right case - partial span. @@ -968,10 +994,10 @@ static bool test_merge_existing(void) vma->vm_ops = &vm_ops; /* This should have no impact. */ vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, flags, &dummy_anon_vma); vmg.middle = vma; vmg.prev = vma; - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_next); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x3000); @@ -1001,9 +1027,9 @@ static bool test_merge_existing(void) vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags); + vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, flags, &dummy_anon_vma); vmg.middle = vma; - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_next); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x2000); @@ -1030,11 +1056,10 @@ static bool test_merge_existing(void) vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; - vma->anon_vma = &dummy_anon_vma; - + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -1064,10 +1089,10 @@ static bool test_merge_existing(void) vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); - vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -1094,10 +1119,10 @@ static bool test_merge_existing(void) vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); - vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -1180,12 +1205,9 @@ static bool test_anon_vma_non_mergeable(void) .mm = &mm, .vmi = &vmi, }; - struct anon_vma_chain dummy_anon_vma_chain1 = { - .anon_vma = &dummy_anon_vma, - }; - struct anon_vma_chain dummy_anon_vma_chain2 = { - .anon_vma = &dummy_anon_vma, - }; + struct anon_vma_chain dummy_anon_vma_chain_1 = {}; + struct anon_vma_chain dummy_anon_vma_chain_2 = {}; + struct anon_vma dummy_anon_vma_2; /* * In the case of modified VMA merge, merging both left and right VMAs @@ -1209,24 +1231,11 @@ static bool test_anon_vma_non_mergeable(void) * * However, when prev is compared to next, the merge should fail. */ - - INIT_LIST_HEAD(&vma_prev->anon_vma_chain); - list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); - ASSERT_TRUE(list_is_singular(&vma_prev->anon_vma_chain)); - vma_prev->anon_vma = &dummy_anon_vma; - ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_prev->anon_vma, vma_prev)); - - INIT_LIST_HEAD(&vma_next->anon_vma_chain); - list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); - ASSERT_TRUE(list_is_singular(&vma_next->anon_vma_chain)); - vma_next->anon_vma = (struct anon_vma *)2; - ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_next->anon_vma, vma_next)); - - ASSERT_FALSE(is_mergeable_anon_vma(vma_prev->anon_vma, vma_next->anon_vma, NULL)); - - vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, NULL); vmg.prev = vma_prev; vmg.middle = vma; + vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1); + __vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2); ASSERT_EQ(merge_existing(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); @@ -1253,17 +1262,12 @@ static bool test_anon_vma_non_mergeable(void) vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); - INIT_LIST_HEAD(&vma_prev->anon_vma_chain); - list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); - vma_prev->anon_vma = (struct anon_vma *)1; - - INIT_LIST_HEAD(&vma_next->anon_vma_chain); - list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); - vma_next->anon_vma = (struct anon_vma *)2; - - vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, NULL); vmg.prev = vma_prev; + vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1); + __vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2); + vmg.anon_vma = NULL; ASSERT_EQ(merge_new(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -1363,8 +1367,8 @@ static bool test_dup_anon_vma(void) vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); - - vma->anon_vma = &dummy_anon_vma; + vmg.anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1392,7 +1396,7 @@ static bool test_dup_anon_vma(void) vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1420,7 +1424,7 @@ static bool test_dup_anon_vma(void) vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, flags); vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); vmg.prev = vma; vmg.middle = vma; @@ -1447,6 +1451,7 @@ static bool test_vmi_prealloc_fail(void) .mm = &mm, .vmi = &vmi, }; + struct anon_vma_chain avc = {}; struct vm_area_struct *vma_prev, *vma; /* @@ -1459,9 +1464,10 @@ static bool test_vmi_prealloc_fail(void) vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); vma->anon_vma = &dummy_anon_vma; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; + vma_set_dummy_anon_vma(vma, &avc); fail_prealloc = true; -- cgit v1.2.3 From bd23f293a0d56fd18d51e5364ecf8c277c6e9531 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 8 Apr 2025 10:29:32 +0100 Subject: tools/testing: add PROCMAP_QUERY helper functions in mm self tests The PROCMAP_QUERY ioctl() is very useful - it allows for binary access to /proc/$pid/[s]maps data and thus convenient lookup of data contained there. This patch exposes this for convenient use by mm self tests so the state of VMAs can easily be queried. Link: https://lkml.kernel.org/r/ce83d877093d1fc594762cf4b82f0c27963030ee.1744104124.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Yeoreum Yun Reviewed-by: Wei Yang Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/vm_util.c | 62 ++++++++++++++++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.h | 21 ++++++++++++ 2 files changed, 83 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index a36734fb62f3..1357e2d6a7b6 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -424,3 +425,64 @@ bool check_vmflag_io(void *addr) flags += flaglen; } } + +/* + * Open an fd at /proc/$pid/maps and configure procmap_out ready for + * PROCMAP_QUERY query. Returns 0 on success, or an error code otherwise. + */ +int open_procmap(pid_t pid, struct procmap_fd *procmap_out) +{ + char path[256]; + int ret = 0; + + memset(procmap_out, '\0', sizeof(*procmap_out)); + sprintf(path, "/proc/%d/maps", pid); + procmap_out->query.size = sizeof(procmap_out->query); + procmap_out->fd = open(path, O_RDONLY); + if (procmap_out < 0) + ret = -errno; + + return ret; +} + +/* Perform PROCMAP_QUERY. Returns 0 on success, or an error code otherwise. */ +int query_procmap(struct procmap_fd *procmap) +{ + int ret = 0; + + if (ioctl(procmap->fd, PROCMAP_QUERY, &procmap->query) == -1) + ret = -errno; + + return ret; +} + +/* + * Try to find the VMA at specified address, returns true if found, false if not + * found, and the test is failed if any other error occurs. + * + * On success, procmap->query is populated with the results. + */ +bool find_vma_procmap(struct procmap_fd *procmap, void *address) +{ + int err; + + procmap->query.query_flags = 0; + procmap->query.query_addr = (unsigned long)address; + err = query_procmap(procmap); + if (!err) + return true; + + if (err != -ENOENT) + ksft_exit_fail_msg("%s: Error %d on ioctl(PROCMAP_QUERY)\n", + __func__, err); + return false; +} + +/* + * Close fd used by PROCMAP_QUERY mechanism. Returns 0 on success, or an error + * code otherwise. + */ +int close_procmap(struct procmap_fd *procmap) +{ + return close(procmap->fd); +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 6effafdc4d8a..9211ba640d9c 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -6,6 +6,7 @@ #include /* ffsl() */ #include /* _SC_PAGESIZE */ #include "../kselftest.h" +#include #define BIT_ULL(nr) (1ULL << (nr)) #define PM_SOFT_DIRTY BIT_ULL(55) @@ -19,6 +20,15 @@ extern unsigned int __page_size; extern unsigned int __page_shift; +/* + * Represents an open fd and PROCMAP_QUERY state for binary (via ioctl) + * /proc/$pid/[s]maps lookup. + */ +struct procmap_fd { + int fd; + struct procmap_query query; +}; + static inline unsigned int psize(void) { if (!__page_size) @@ -73,6 +83,17 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, bool miss, bool wp, bool minor, uint64_t *ioctls); unsigned long get_free_hugepages(void); bool check_vmflag_io(void *addr); +int open_procmap(pid_t pid, struct procmap_fd *procmap_out); +int query_procmap(struct procmap_fd *procmap); +bool find_vma_procmap(struct procmap_fd *procmap, void *address); +int close_procmap(struct procmap_fd *procmap); + +static inline int open_self_procmap(struct procmap_fd *procmap_out) +{ + pid_t pid = getpid(); + + return open_procmap(pid, procmap_out); +} /* * On ppc64 this will only work with radix 2M hugepage size -- cgit v1.2.3 From 10d288964d48e52354f002f7e0f64d1df9496ab1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 8 Apr 2025 10:29:33 +0100 Subject: tools/testing/selftests: assert that anon merge cases behave as expected Prior to the recently applied commit that permits this merge, mprotect()'ing a faulted VMA, adjacent to an unfaulted VMA, such that the two share characteristics would fail to merge due to what appear to be unintended consequences of commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"). Now we have fixed this bug, assert that we can indeed merge anonymous VMAs this way. Also assert that forked source/target VMAs are equally rejected. Previously, all empty target anon merges with one VMA faulted and the other unfaulted would be rejected incorrectly, now we ensure that unforked merge, but forked do not. Additionally, add the new test file to the MEMORY MAPPING section in MAINTAINERS, as these tests are explicitly memory mapping related. Link: https://lkml.kernel.org/r/2b69330274a3b71721f7042c5eabe91143934415.1744104124.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Yeoreum Yun Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/merge.c | 455 ++++++++++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 4 + 4 files changed, 461 insertions(+) create mode 100644 tools/testing/selftests/mm/merge.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index c5241b193db8..91db34941a14 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -58,3 +58,4 @@ hugetlb_dio pkey_sighandler_tests_32 pkey_sighandler_tests_64 guard-regions +merge diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 8270895039d1..ad4d6043a60f 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -98,6 +98,7 @@ TEST_GEN_FILES += hugetlb_madv_vs_map TEST_GEN_FILES += hugetlb_dio TEST_GEN_FILES += droppable TEST_GEN_FILES += guard-regions +TEST_GEN_FILES += merge ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c new file mode 100644 index 000000000000..c76646cdf6e6 --- /dev/null +++ b/tools/testing/selftests/mm/merge.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include "vm_util.h" + +FIXTURE(merge) +{ + unsigned int page_size; + char *carveout; + struct procmap_fd procmap; +}; + +FIXTURE_SETUP(merge) +{ + self->page_size = psize(); + /* Carve out PROT_NONE region to map over. */ + self->carveout = mmap(NULL, 12 * self->page_size, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(self->carveout, MAP_FAILED); + /* Setup PROCMAP_QUERY interface. */ + ASSERT_EQ(open_self_procmap(&self->procmap), 0); +} + +FIXTURE_TEARDOWN(merge) +{ + ASSERT_EQ(munmap(self->carveout, 12 * self->page_size), 0); + ASSERT_EQ(close_procmap(&self->procmap), 0); +} + +TEST_F(merge, mprotect_unfaulted_left) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * Map 10 pages of R/W memory within. MAP_NORESERVE so we don't hit + * merge failure due to lack of VM_ACCOUNT flag by mistake. + * + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[page_size], 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the first 5 pages read-only, splitting the VMA: + * + * RO RW + * |-----------|-----------| + * | unfaulted | unfaulted | + * |-----------|-----------| + */ + ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ), 0); + /* + * Fault in the first of the last 5 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RO RW + * |-----------|-----------| + * | unfaulted | faulted | + * |-----------|-----------| + */ + ptr[5 * page_size] = 'x'; + /* + * Now mprotect() the RW region read-only, we should merge (though for + * ~15 years we did not! :): + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(&ptr[5 * page_size], 5 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 10 * page_size); +} + +TEST_F(merge, mprotect_unfaulted_right) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[page_size], 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the last 5 pages read-only, splitting the VMA: + * + * RW RO + * |-----------|-----------| + * | unfaulted | unfaulted | + * |-----------|-----------| + */ + ASSERT_EQ(mprotect(&ptr[5 * page_size], 5 * page_size, PROT_READ), 0); + /* + * Fault in the first of the first 5 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RW RO + * |-----------|-----------| + * | faulted | unfaulted | + * |-----------|-----------| + */ + ptr[0] = 'x'; + /* + * Now mprotect() the RW region read-only, we should merge: + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 10 * page_size); +} + +TEST_F(merge, mprotect_unfaulted_both) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the first and last 3 pages read-only, splitting the VMA: + * + * RO RW RO + * |-----------|-----------|-----------| + * | unfaulted | unfaulted | unfaulted | + * |-----------|-----------|-----------| + */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0); + ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0); + /* + * Fault in the first of the middle 3 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RO RW RO + * |-----------|-----------|-----------| + * | unfaulted | faulted | unfaulted | + * |-----------|-----------|-----------| + */ + ptr[3 * page_size] = 'x'; + /* + * Now mprotect() the RW region read-only, we should merge: + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size); +} + +TEST_F(merge, mprotect_faulted_left_unfaulted_right) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the last 3 pages read-only, splitting the VMA: + * + * RW RO + * |-----------------------|-----------| + * | unfaulted | unfaulted | + * |-----------------------|-----------| + */ + ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0); + /* + * Fault in the first of the first 6 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RW RO + * |-----------------------|-----------| + * | unfaulted | unfaulted | + * |-----------------------|-----------| + */ + ptr[0] = 'x'; + /* + * Now make the first 3 pages read-only, splitting the VMA: + * + * RO RW RO + * |-----------|-----------|-----------| + * | faulted | faulted | unfaulted | + * |-----------|-----------|-----------| + */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0); + /* + * Now mprotect() the RW region read-only, we should merge: + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size); +} + +TEST_F(merge, mprotect_unfaulted_left_faulted_right) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the first 3 pages read-only, splitting the VMA: + * + * RO RW + * |-----------|-----------------------| + * | unfaulted | unfaulted | + * |-----------|-----------------------| + */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0); + /* + * Fault in the first of the last 6 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RO RW + * |-----------|-----------------------| + * | unfaulted | faulted | + * |-----------|-----------------------| + */ + ptr[3 * page_size] = 'x'; + /* + * Now make the last 3 pages read-only, splitting the VMA: + * + * RO RW RO + * |-----------|-----------|-----------| + * | unfaulted | faulted | faulted | + * |-----------|-----------|-----------| + */ + ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0); + /* + * Now mprotect() the RW region read-only, we should merge: + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size); +} + +TEST_F(merge, forked_target_vma) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + pid_t pid; + char *ptr, *ptr2; + int i; + + /* + * |-----------| + * | unfaulted | + * |-----------| + */ + ptr = mmap(&carveout[page_size], 5 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* + * Fault in process. + * + * |-----------| + * | faulted | + * |-----------| + */ + ptr[0] = 'x'; + + pid = fork(); + ASSERT_NE(pid, -1); + + if (pid != 0) { + wait(NULL); + return; + } + + /* Child process below: */ + + /* Reopen for child. */ + ASSERT_EQ(close_procmap(&self->procmap), 0); + ASSERT_EQ(open_self_procmap(&self->procmap), 0); + + /* unCOWing everything does not cause the AVC to go away. */ + for (i = 0; i < 5 * page_size; i += page_size) + ptr[i] = 'x'; + + /* + * Map in adjacent VMA in child. + * + * forked + * |-----------|-----------| + * | faulted | unfaulted | + * |-----------|-----------| + * ptr ptr2 + */ + ptr2 = mmap(&ptr[5 * page_size], 5 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + ASSERT_NE(ptr2, MAP_FAILED); + + /* Make sure not merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 5 * page_size); +} + +TEST_F(merge, forked_source_vma) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + pid_t pid; + char *ptr, *ptr2; + int i; + + /* + * |-----------|------------| + * | unfaulted | | + * |-----------|------------| + */ + ptr = mmap(&carveout[page_size], 5 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* + * Fault in process. + * + * |-----------|------------| + * | faulted | | + * |-----------|------------| + */ + ptr[0] = 'x'; + + pid = fork(); + ASSERT_NE(pid, -1); + + if (pid != 0) { + wait(NULL); + return; + } + + /* Child process below: */ + + /* Reopen for child. */ + ASSERT_EQ(close_procmap(&self->procmap), 0); + ASSERT_EQ(open_self_procmap(&self->procmap), 0); + + /* unCOWing everything does not cause the AVC to go away. */ + for (i = 0; i < 5 * page_size; i += page_size) + ptr[i] = 'x'; + + /* + * Map in adjacent VMA in child, ptr2 after ptr, but incompatible. + * + * forked RW RWX + * |-----------|-----------| + * | faulted | unfaulted | + * |-----------|-----------| + * ptr ptr2 + */ + ptr2 = mmap(&carveout[6 * page_size], 5 * page_size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr2, MAP_FAILED); + + /* Make sure not merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr2)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr2); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size); + + /* + * Now mprotect forked region to RWX so it becomes the source for the + * merge to unfaulted region: + * + * forked RWX RWX + * |-----------|-----------| + * | faulted | unfaulted | + * |-----------|-----------| + * ptr ptr2 + * + * This should NOT result in a merge, as ptr was forked. + */ + ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ | PROT_WRITE | PROT_EXEC), 0); + /* Again, make sure not merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr2)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr2); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 9aff33b10999..188b125bf1f6 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -79,6 +79,8 @@ separated by spaces: test prctl(PR_SET_MDWE, ...) - page_frag test handling of page fragment allocation and freeing +- vma_merge + test VMA merge cases behave as expected example: ./run_vmtests.sh -t "hmm mmap ksm" EOF @@ -421,6 +423,8 @@ CATEGORY="madv_guard" run_test ./guard-regions # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +CATEGORY="vma_merge" run_test ./merge + if [ -x ./memfd_secret ] then (echo 0 > /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix -- cgit v1.2.3 From f9d3a963fef4d3377b7ee122408cf2cdf37b3181 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 10 Apr 2025 19:14:42 +0000 Subject: maple_tree: use height and depth consistently For the maple tree, the root node is defined to have a depth of 0 with a height of 1. Each level down from the node, these values are incremented by 1. Various code paths define a root with depth 1 which is inconsisent with the definition. Modify the code to be consistent with this definition. In mas_spanning_rebalance(), l_mas.depth was being used to track the height based on the number of iterations done in the main loop. This information was then used in mas_put_in_tree() to set the height. Rather than overload the l_mas.depth field to track height, simply keep track of height in the local variable new_height and directly pass this to mas_wmb_replace() which will be passed into mas_put_in_tree(). This allows up to remove writes to l_mas.depth. Link: https://lkml.kernel.org/r/20250410191446.2474640-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Wei Yang Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index bc30050227fd..e0f8fabe8821 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36248,6 +36248,21 @@ static noinline void __init check_mtree_dup(struct maple_tree *mt) extern void test_kmem_cache_bulk(void); +static inline void check_spanning_store_height(struct maple_tree *mt) +{ + int index = 0; + MA_STATE(mas, mt, 0, 0); + mas_lock(&mas); + while (mt_height(mt) != 3) { + mas_store_gfp(&mas, xa_mk_value(index), GFP_KERNEL); + mas_set(&mas, ++index); + } + mas_set_range(&mas, 90, 140); + mas_store_gfp(&mas, xa_mk_value(index), GFP_KERNEL); + MT_BUG_ON(mt, mas_mt_height(&mas) != 2); + mas_unlock(&mas); +} + /* callback function used for check_nomem_writer_race() */ static void writer2(void *maple_tree) { @@ -36414,6 +36429,10 @@ void farmer_tests(void) check_spanning_write(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_spanning_store_height(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_null_expand(&tree); mtree_destroy(&tree); -- cgit v1.2.3 From ad88fc17d2dafe45e40de2af80207f4b2e3b1f71 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 10 Apr 2025 19:14:43 +0000 Subject: maple_tree: use vacant nodes to reduce worst case allocations In order to determine the store type for a maple tree operation, a walk of the tree is done through mas_wr_walk(). This function descends the tree until a spanning write is detected or we reach a leaf node. While descending, keep track of the height at which we encounter a node with available space. This is done by checking if mas->end is less than the number of slots a given node type can fit. Now that the height of the vacant node is tracked, we can use the difference between the height of the tree and the height of the vacant node to know how many levels we will have to propagate creating new nodes. Update mas_prealloc_calc() to consider the vacant height and reduce the number of worst-case allocations. Rebalancing and spanning stores are not supported and fall back to using the full height of the tree for allocations. Update preallocation testing assertions to take into account vacant height. Link: https://lkml.kernel.org/r/20250410191446.2474640-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Wei Yang Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 79 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index e0f8fabe8821..e37a3ab2e921 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35475,15 +35475,65 @@ static void check_dfs_preorder(struct maple_tree *mt) } /* End of depth first search tests */ +/* get height of the lowest non-leaf node with free space */ +static unsigned char get_vacant_height(struct ma_wr_state *wr_mas, void *entry) +{ + struct ma_state *mas = wr_mas->mas; + char vacant_height = 0; + enum maple_type type; + unsigned long *pivots; + unsigned long min = 0; + unsigned long max = ULONG_MAX; + unsigned char offset; + + /* start traversal */ + mas_reset(mas); + mas_start(mas); + if (!xa_is_node(mas_root(mas))) + return 0; + + type = mte_node_type(mas->node); + wr_mas->type = type; + while (!ma_is_leaf(type)) { + mas_node_walk(mas, mte_to_node(mas->node), type, &min, &max); + offset = mas->offset; + mas->end = mas_data_end(mas); + pivots = ma_pivots(mte_to_node(mas->node), type); + + if (pivots) { + if (offset) + min = pivots[mas->offset - 1]; + if (offset < mas->end) + max = pivots[mas->offset]; + } + wr_mas->r_max = offset < mas->end ? pivots[offset] : mas->max; + + /* detect spanning write */ + if (mas_is_span_wr(wr_mas)) + break; + + if (mas->end < mt_slot_count(mas->node) - 1) + vacant_height = mas->depth + 1; + + mas_descend(mas); + type = mte_node_type(mas->node); + mas->depth++; + } + + return vacant_height; +} + /* Preallocation testing */ static noinline void __init check_prealloc(struct maple_tree *mt) { unsigned long i, max = 100; unsigned long allocated; unsigned char height; + unsigned char vacant_height; struct maple_node *mn; void *ptr = check_prealloc; MA_STATE(mas, mt, 10, 20); + MA_WR_STATE(wr_mas, &mas, ptr); mt_set_non_kernel(1000); for (i = 0; i <= max; i++) @@ -35494,8 +35544,9 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); + vacant_height = get_vacant_height(&wr_mas, ptr); MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); @@ -35503,8 +35554,9 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); + vacant_height = get_vacant_height(&wr_mas, ptr); MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); @@ -35514,7 +35566,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 3); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mn->parent = ma_parent_ptr(mn); @@ -35527,7 +35580,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 3); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); @@ -35540,7 +35594,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 3); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); @@ -35553,7 +35608,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 3); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -35578,7 +35634,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 2); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 2); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); @@ -35595,8 +35652,14 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); + vacant_height = get_vacant_height(&wr_mas, ptr); MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + /* + * vacant height cannot be used to compute the number of nodes needed + * as the root contains two entries which means it is on the verge of + * insufficiency. The worst case full height of the tree is needed. + */ + MT_BUG_ON(mt, allocated != height * 3 + 1); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mas_set_range(&mas, 0, 200); -- cgit v1.2.3 From 271152a973cb01c135d29e91d1a05f51fbd88a9c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 10 Apr 2025 19:14:45 +0000 Subject: maple_tree: add sufficient height In order to support rebalancing and spanning stores using less than the worst case number of nodes, we need to track more than just the vacant height. Using only vacant height to reduce the worst case maple node allocation count can lead to a shortcoming of nodes in the following scenarios. For rebalancing writes, when a leaf node becomes insufficient, it may be combined with a sibling into a single node. This means that the parent node which has entries for this children will lose one entry. If this parent node was just meeting the minimum entries, losing one entry will now cause this parent node to be insufficient. This leads to a cascading operation of rebalancing at different levels and can lead to more node allocations than simply using vacant height can return. For spanning writes, a similar situation occurs. At the location at which a spanning write is detected, the number of ancestor nodes may similarly need to rebalanced into a smaller number of nodes and the same cascading situation could occur. To use less than the full height of the tree for the number of allocations, we also need to track the height at which a non-leaf node cannot become insufficient. This means even if a rebalance occurs to a child of this node, it currently has enough entries that it can lose one without any further action. This field is stored in the maple write state as sufficient height. In mas_prealloc_calc() when figuring out how many nodes to allocate, we check if the vacant node is lower in the tree than a sufficient node (has a larger value). If it is, we cannot use the vacant height and must use the difference in the height and sufficient height as the basis for the number of nodes needed. An off by one bug was also discovered in mast_overflow() where it is using >= rather than >. This caused extra iterations of the mas_spanning_rebalance() loop and lead to unneeded allocations. A test is also added to check the number of allocations is correct. Link: https://lkml.kernel.org/r/20250410191446.2474640-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Wei Yang Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index e37a3ab2e921..2c0b38301253 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36326,6 +36326,30 @@ static inline void check_spanning_store_height(struct maple_tree *mt) mas_unlock(&mas); } +/* + * Test to check the path of a spanning rebalance which results in + * a collapse where the rebalancing of the child node leads to + * insufficieny in the parent node. + */ +static void check_collapsing_rebalance(struct maple_tree *mt) +{ + int i = 0; + MA_STATE(mas, mt, ULONG_MAX, ULONG_MAX); + + /* create a height 6 tree */ + while (mt_height(mt) < 6) { + mtree_store_range(mt, i, i + 10, xa_mk_value(i), GFP_KERNEL); + i += 9; + } + + /* delete all entries one at a time, starting from the right */ + do { + mas_erase(&mas); + } while (mas_prev(&mas, 0) != NULL); + + mtree_unlock(mt); +} + /* callback function used for check_nomem_writer_race() */ static void writer2(void *maple_tree) { @@ -36496,6 +36520,10 @@ void farmer_tests(void) check_spanning_store_height(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_collapsing_rebalance(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_null_expand(&tree); mtree_destroy(&tree); -- cgit v1.2.3 From 585a9145886ad1d06b39ddcd72d457f37ebb4ff4 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Thu, 10 Apr 2025 05:07:48 -0500 Subject: selftests/mm: restore default nr_hugepages value during cleanup in hugetlb_reparenting_test.sh During cleanup, the value of /proc/sys/vm/nr_hugepages is currently being set to 0. At the end of the test, if all tests pass, the original nr_hugepages value is restored. However, if any test fails, it remains set to 0. With this patch, we ensure that the original nr_hugepages value is restored during cleanup, regardless of whether the test passes or fails. Link: https://lkml.kernel.org/r/20250410100748.2310-1-donettom@linux.ibm.com Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests") Signed-off-by: Donet Tom Cc: Li Wang Cc: "Ritesh Harjani (IBM)" Cc: Shuah Khan Cc: Waiman Long Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index 9245549b66cf..0dd31892ff67 100755 --- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -56,7 +56,7 @@ function cleanup() { rmdir "$CGROUP_ROOT"/a/b 2>/dev/null rmdir "$CGROUP_ROOT"/a 2>/dev/null rmdir "$CGROUP_ROOT"/test1 2>/dev/null - echo 0 >/proc/sys/vm/nr_hugepages + echo $nr_hugepgs >/proc/sys/vm/nr_hugepages set -e } -- cgit v1.2.3 From f736953e2b1f1d7096e70580180f98744c9c9c86 Mon Sep 17 00:00:00 2001 From: Enze Li Date: Fri, 11 Apr 2025 10:43:32 +0800 Subject: selftests/damon: remove the remaining test scripts for DAMON debugfs interface DAMON has dropped debugfs support; therefore, remove these unused scripts. Link: https://lkml.kernel.org/r/20250411024332.1373861-1-enze.li@linux.dev Fixes: 5ec4333b1967 ("mm/damon: remove DAMON debugfs interface") Signed-off-by: Enze Li Reviewed-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- tools/testing/selftests/damon/_chk_dependency.sh | 52 ------------------- tools/testing/selftests/damon/_debugfs_common.sh | 64 ------------------------ 3 files changed, 1 insertion(+), 117 deletions(-) delete mode 100644 tools/testing/selftests/damon/_chk_dependency.sh delete mode 100644 tools/testing/selftests/damon/_debugfs_common.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index ecbf07afc6dd..ff21524be458 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -3,7 +3,7 @@ TEST_GEN_FILES += access_memory access_memory_even -TEST_FILES = _chk_dependency.sh _damon_sysfs.py +TEST_FILES = _damon_sysfs.py # functionality tests TEST_PROGS += sysfs.sh diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh deleted file mode 100644 index dda3a87dc00a..000000000000 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -DBGFS=$(grep debugfs /proc/mounts --max-count 1 | awk '{print $2}') -if [ "$DBGFS" = "" ] -then - echo "debugfs not mounted" - exit $ksft_skip -fi - -DBGFS+="/damon" - -if [ $EUID -ne 0 ]; -then - echo "Run as root" - exit $ksft_skip -fi - -if [ ! -d "$DBGFS" ] -then - echo "$DBGFS not found" - exit $ksft_skip -fi - -if [ -f "$DBGFS/monitor_on_DEPRECATED" ] -then - monitor_on_file="monitor_on_DEPRECATED" -else - monitor_on_file="monitor_on" -fi - -for f in attrs target_ids "$monitor_on_file" -do - if [ ! -f "$DBGFS/$f" ] - then - echo "$f not found" - exit 1 - fi -done - -permission_error="Operation not permitted" -for f in attrs target_ids "$monitor_on_file" -do - status=$( cat "$DBGFS/$f" 2>&1 ) - if [ "${status#*$permission_error}" != "$status" ]; then - echo "Permission for reading $DBGFS/$f denied; maybe secureboot enabled?" - exit $ksft_skip - fi -done diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh deleted file mode 100644 index 54d45791b0d9..000000000000 --- a/tools/testing/selftests/damon/_debugfs_common.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -test_write_result() { - file=$1 - content=$2 - orig_content=$3 - expect_reason=$4 - expected=$5 - - if [ "$expected" = "0" ] - then - echo "$content" > "$file" - else - echo "$content" > "$file" 2> /dev/null - fi - if [ $? -ne "$expected" ] - then - echo "writing $content to $file doesn't return $expected" - echo "expected because: $expect_reason" - echo "$orig_content" > "$file" - exit 1 - fi -} - -test_write_succ() { - test_write_result "$1" "$2" "$3" "$4" 0 -} - -test_write_fail() { - test_write_result "$1" "$2" "$3" "$4" 1 -} - -test_content() { - file=$1 - orig_content=$2 - expected=$3 - expect_reason=$4 - - content=$(cat "$file") - if [ "$content" != "$expected" ] - then - echo "reading $file expected $expected but $content" - echo "expected because: $expect_reason" - echo "$orig_content" > "$file" - exit 1 - fi -} - -source ./_chk_dependency.sh - -damon_onoff="$DBGFS/monitor_on" -if [ -f "$DBGFS/monitor_on_DEPRECATED" ] -then - damon_onoff="$DBGFS/monitor_on_DEPRECATED" -else - damon_onoff="$DBGFS/monitor_on" -fi - -if [ $(cat "$damon_onoff") = "on" ] -then - echo "monitoring is on" - exit $ksft_skip -fi -- cgit v1.2.3 From d48e8d27cd61d8485ef2c1187f3048de77af2d11 Mon Sep 17 00:00:00 2001 From: Siddarth G Date: Sun, 27 Apr 2025 15:56:39 +0530 Subject: selftests/mm: use long for dwRegionSize Change the type of 'dwRegionSize' in wp_init() and wp_free() from int to long to match callers that pass long or unsigned long long values. wp_addr_range function is left unchanged because it passes 'dwRegionSize' parameter directly to pagemap_ioctl, which expects an int. This patch does not fix any actual known issues. It aligns parameter types with their actual usage and avoids any potential future issues. Link: https://lkml.kernel.org/r/20250427102639.39978-1-siddarthsgml@gmail.com Signed-off-by: Siddarth G Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index fe5ae8b25ff6..b07acc86f4f0 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -112,7 +112,7 @@ int init_uffd(void) return 0; } -int wp_init(void *lpBaseAddress, int dwRegionSize) +int wp_init(void *lpBaseAddress, long dwRegionSize) { struct uffdio_register uffdio_register; struct uffdio_writeprotect wp; @@ -136,7 +136,7 @@ int wp_init(void *lpBaseAddress, int dwRegionSize) return 0; } -int wp_free(void *lpBaseAddress, int dwRegionSize) +int wp_free(void *lpBaseAddress, long dwRegionSize) { struct uffdio_register uffdio_register; -- cgit v1.2.3 From 4c78cc596bb8d39532f059e0198eeabf370c50f5 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 9 May 2025 00:46:19 -0700 Subject: memblock: add MEMBLOCK_RSRV_KERN flag Patch series "kexec: introduce Kexec HandOver (KHO)", v8. Kexec today considers itself purely a boot loader: When we enter the new kernel, any state the previous kernel left behind is irrelevant and the new kernel reinitializes the system. However, there are use cases where this mode of operation is not what we actually want. In virtualization hosts for example, we want to use kexec to update the host kernel while virtual machine memory stays untouched. When we add device assignment to the mix, we also need to ensure that IOMMU and VFIO states are untouched. If we add PCIe peer to peer DMA, we need to do the same for the PCI subsystem. If we want to kexec while an SEV-SNP enabled virtual machine is running, we need to preserve the VM context pages and physical memory. See "pkernfs: Persisting guest memory and kernel/device state safely across kexec" Linux Plumbers Conference 2023 presentation for details: https://lpc.events/event/17/contributions/1485/ To start us on the journey to support all the use cases above, this patch implements basic infrastructure to allow hand over of kernel state across kexec (Kexec HandOver, aka KHO). As a really simple example target, we use memblock's reserve_mem. With this patchset applied, memory that was reserved using "reserve_mem" command line options remains intact after kexec and it is guaranteed to reside at the same physical address. == Alternatives == There are alternative approaches to (parts of) the problems above: * Memory Pools [1] - preallocated persistent memory region + allocator * PRMEM [2] - resizable persistent memory regions with fixed metadata pointer on the kernel command line + allocator * Pkernfs [3] - preallocated file system for in-kernel data with fixed address location on the kernel command line * PKRAM [4] - handover of user space pages using a fixed metadata page specified via command line All of the approaches above fundamentally have the same problem: They require the administrator to explicitly carve out a physical memory location because they have no mechanism outside of the kernel command line to pass data (including memory reservations) between kexec'ing kernels. KHO provides that base foundation. We will determine later whether we still need any of the approaches above for fast bulk memory handover of for example IOMMU page tables. But IMHO they would all be users of KHO, with KHO providing the foundational primitive to pass metadata and bulk memory reservations as well as provide easy versioning for data. == Overview == We introduce a metadata file that the kernels pass between each other. How they pass it is architecture specific. The file's format is a Flattened Device Tree (fdt) which has a generator and parser already included in Linux. KHO is enabled in the kernel command line by `kho=on`. When the root user enables KHO through /sys/kernel/debug/kho/out/finalize, the kernel invokes callbacks to every KHO users to register preserved memory regions, which contain drivers' states. When the actual kexec happens, the fdt is part of the image set that we boot into. In addition, we keep "scratch regions" available for kexec: physically contiguous memory regions that are guaranteed to not have any memory that KHO would preserve. The new kernel bootstraps itself using the scratch regions and sets all handed over memory as in use. When drivers initialize that support KHO, they introspect the fdt, restore preserved memory regions, and retrieve their states stored in the preserved memory. == Limitations == Currently KHO is only implemented for file based kexec. The kernel interfaces in the patch set are already in place to support user space kexec as well, but it is still not implemented it yet inside kexec tools. == How to Use == To use the code, please boot the kernel with the "kho=on" command line parameter. KHO will automatically create scratch regions. If you want to set the scratch size explicitly you can use "kho_scratch=" command line parameter. For instance, "kho_scratch=16M,512M,256M" will reserve a 16 MiB low memory scratch area, a 512 MiB global scratch region, and 256 MiB per NUMA node scratch regions on boot. Make sure to have a reserved memory range requested with reserv_mem command line option, for example, "reserve_mem=64m:4k:n1". Then before you invoke file based "kexec -l", finalize KHO FDT: # echo 1 > /sys/kernel/debug/kho/out/finalize You can preview the generated FDT using `dtc`, # dtc /sys/kernel/debug/kho/out/fdt # dtc /sys/kernel/debug/kho/out/sub_fdts/memblock `dtc` is available on ubuntu by `sudo apt-get install device-tree-compiler`. Now kexec into the new kernel, # kexec -l Image --initrd=initrd -s # kexec -e (The order of KHO finalization and "kexec -l" does not matter.) The new kernel will boot up and contain the previous kernel's reserve_mem contents at the same physical address as the first kernel. You can also review the FDT passed from the old kernel, # dtc /sys/kernel/debug/kho/in/fdt # dtc /sys/kernel/debug/kho/in/sub_fdts/memblock This patch (of 17): To denote areas that were reserved for kernel use either directly with memblock_reserve_kern() or via memblock allocations. Link: https://lore.kernel.org/lkml/20250424083258.2228122-1-changyuanl@google.com/ Link: https://lore.kernel.org/lkml/aAeaJ2iqkrv_ffhT@kernel.org/ Link: https://lore.kernel.org/lkml/35c58191-f774-40cf-8d66-d1e2aaf11a62@intel.com/ Link: https://lore.kernel.org/lkml/20250424093302.3894961-1-arnd@kernel.org/ Link: https://lkml.kernel.org/r/20250509074635.3187114-1-changyuanl@google.com Link: https://lkml.kernel.org/r/20250509074635.3187114-2-changyuanl@google.com Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Alexander Graf Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Cc: Dave Hansen Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- tools/testing/memblock/tests/alloc_api.c | 22 +++++++++++----------- tools/testing/memblock/tests/alloc_helpers_api.c | 4 ++-- tools/testing/memblock/tests/alloc_nid_api.c | 20 ++++++++++---------- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index 68f1a75cd72c..c55f67dd367d 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -134,7 +134,7 @@ static int alloc_top_down_before_check(void) PREFIX_PUSH(); setup_memblock(); - memblock_reserve(memblock_end_of_DRAM() - total_size, r1_size); + memblock_reserve_kern(memblock_end_of_DRAM() - total_size, r1_size); allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); @@ -182,7 +182,7 @@ static int alloc_top_down_after_check(void) total_size = r1.size + r2_size; - memblock_reserve(r1.base, r1.size); + memblock_reserve_kern(r1.base, r1.size); allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); @@ -231,8 +231,8 @@ static int alloc_top_down_second_fit_check(void) total_size = r1.size + r2.size + r3_size; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); @@ -285,8 +285,8 @@ static int alloc_in_between_generic_check(void) total_size = r1.size + r2.size + r3_size; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); @@ -422,7 +422,7 @@ static int alloc_limited_space_generic_check(void) setup_memblock(); /* Simulate almost-full memory */ - memblock_reserve(memblock_start_of_DRAM(), reserved_size); + memblock_reserve_kern(memblock_start_of_DRAM(), reserved_size); allocated_ptr = run_memblock_alloc(available_size, SMP_CACHE_BYTES); @@ -608,7 +608,7 @@ static int alloc_bottom_up_before_check(void) PREFIX_PUSH(); setup_memblock(); - memblock_reserve(memblock_start_of_DRAM() + r1_size, r2_size); + memblock_reserve_kern(memblock_start_of_DRAM() + r1_size, r2_size); allocated_ptr = run_memblock_alloc(r1_size, SMP_CACHE_BYTES); @@ -655,7 +655,7 @@ static int alloc_bottom_up_after_check(void) total_size = r1.size + r2_size; - memblock_reserve(r1.base, r1.size); + memblock_reserve_kern(r1.base, r1.size); allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); @@ -705,8 +705,8 @@ static int alloc_bottom_up_second_fit_check(void) total_size = r1.size + r2.size + r3_size; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); diff --git a/tools/testing/memblock/tests/alloc_helpers_api.c b/tools/testing/memblock/tests/alloc_helpers_api.c index 3ef9486da8a0..e5362cfd2ff3 100644 --- a/tools/testing/memblock/tests/alloc_helpers_api.c +++ b/tools/testing/memblock/tests/alloc_helpers_api.c @@ -163,7 +163,7 @@ static int alloc_from_top_down_no_space_above_check(void) min_addr = memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2; /* No space above this address */ - memblock_reserve(min_addr, r2_size); + memblock_reserve_kern(min_addr, r2_size); allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr); @@ -199,7 +199,7 @@ static int alloc_from_top_down_min_addr_cap_check(void) start_addr = (phys_addr_t)memblock_start_of_DRAM(); min_addr = start_addr - SMP_CACHE_BYTES * 3; - memblock_reserve(start_addr + r1_size, MEM_SIZE - r1_size); + memblock_reserve_kern(start_addr + r1_size, MEM_SIZE - r1_size); allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr); diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 49bb416d34ff..562e4701b0e0 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -324,7 +324,7 @@ static int alloc_nid_min_reserved_generic_check(void) min_addr = max_addr - r2_size; reserved_base = min_addr - r1_size; - memblock_reserve(reserved_base, r1_size); + memblock_reserve_kern(reserved_base, r1_size); allocated_ptr = run_memblock_alloc_nid(r2_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -374,7 +374,7 @@ static int alloc_nid_max_reserved_generic_check(void) max_addr = memblock_end_of_DRAM() - r1_size; min_addr = max_addr - r2_size; - memblock_reserve(max_addr, r1_size); + memblock_reserve_kern(max_addr, r1_size); allocated_ptr = run_memblock_alloc_nid(r2_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -436,8 +436,8 @@ static int alloc_nid_top_down_reserved_with_space_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -499,8 +499,8 @@ static int alloc_nid_reserved_full_merge_generic_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -563,8 +563,8 @@ static int alloc_nid_top_down_reserved_no_space_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -909,8 +909,8 @@ static int alloc_nid_bottom_up_reserved_with_space_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, -- cgit v1.2.3 From f60b6634cd88a749fdcd9edfeb2079c23aa05b66 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 24 Apr 2025 17:57:29 -0400 Subject: mm/selftests: add a test to verify mmap_changing race with -EAGAIN Add an unit test to verify the recent mmap_changing ABI breakage. Note that I used some tricks here and there to make the test simple, e.g. I abused UFFDIO_MOVE on top of shmem with the fact that I know what I want to test will be even earlier than the vma type check. Rich comments were added to explain trivial details. Before that fix, -EAGAIN would have been written to the copy field most of the time but not always; the test should be able to reliably trigger the outlier case. After the fix, it's written always, the test verifies that making sure corresponding field (e.g. copy.copy for UFFDIO_COPY) is updated. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20250424215729.194656-3-peterx@redhat.com Signed-off-by: Peter Xu Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 202 +++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index e8fd9011c2a3..c73fd5d455c8 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -1231,6 +1231,182 @@ static void uffd_move_pmd_split_test(uffd_test_args_t *targs) uffd_move_pmd_handle_fault); } +static bool +uffdio_verify_results(const char *name, int ret, int error, long result) +{ + /* + * Should always return -1 with errno=EAGAIN, with corresponding + * result field updated in ioctl() args to be -EAGAIN too + * (e.g. copy.copy field for UFFDIO_COPY). + */ + if (ret != -1) { + uffd_test_fail("%s should have returned -1", name); + return false; + } + + if (error != EAGAIN) { + uffd_test_fail("%s should have errno==EAGAIN", name); + return false; + } + + if (result != -EAGAIN) { + uffd_test_fail("%s should have been updated for -EAGAIN", + name); + return false; + } + + return true; +} + +/* + * This defines a function to test one ioctl. Note that here "field" can + * be 1 or anything not -EAGAIN. With that initial value set, we can + * verify later that it should be updated by kernel (when -EAGAIN + * returned), by checking whether it is also updated to -EAGAIN. + */ +#define DEFINE_MMAP_CHANGING_TEST(name, ioctl_name, field) \ + static bool uffdio_mmap_changing_test_##name(int fd) \ + { \ + int ret; \ + struct uffdio_##name args = { \ + .field = 1, \ + }; \ + ret = ioctl(fd, ioctl_name, &args); \ + return uffdio_verify_results(#ioctl_name, ret, errno, args.field); \ + } + +DEFINE_MMAP_CHANGING_TEST(zeropage, UFFDIO_ZEROPAGE, zeropage) +DEFINE_MMAP_CHANGING_TEST(copy, UFFDIO_COPY, copy) +DEFINE_MMAP_CHANGING_TEST(move, UFFDIO_MOVE, move) +DEFINE_MMAP_CHANGING_TEST(poison, UFFDIO_POISON, updated) +DEFINE_MMAP_CHANGING_TEST(continue, UFFDIO_CONTINUE, mapped) + +typedef enum { + /* We actually do not care about any state except UNINTERRUPTIBLE.. */ + THR_STATE_UNKNOWN = 0, + THR_STATE_UNINTERRUPTIBLE, +} thread_state; + +static void sleep_short(void) +{ + usleep(1000); +} + +static thread_state thread_state_get(pid_t tid) +{ + const char *header = "State:\t"; + char tmp[256], *p, c; + FILE *fp; + + snprintf(tmp, sizeof(tmp), "/proc/%d/status", tid); + fp = fopen(tmp, "r"); + + if (!fp) + return THR_STATE_UNKNOWN; + + while (fgets(tmp, sizeof(tmp), fp)) { + p = strstr(tmp, header); + if (p) { + /* For example, "State:\tD (disk sleep)" */ + c = *(p + sizeof(header) - 1); + return c == 'D' ? + THR_STATE_UNINTERRUPTIBLE : THR_STATE_UNKNOWN; + } + } + + return THR_STATE_UNKNOWN; +} + +static void thread_state_until(pid_t tid, thread_state state) +{ + thread_state s; + + do { + s = thread_state_get(tid); + sleep_short(); + } while (s != state); +} + +static void *uffd_mmap_changing_thread(void *opaque) +{ + volatile pid_t *pid = opaque; + int ret; + + /* Unfortunately, it's only fetch-able from the thread itself.. */ + assert(*pid == 0); + *pid = syscall(SYS_gettid); + + /* Inject an event, this will hang solid until the event read */ + ret = madvise(area_dst, page_size, MADV_REMOVE); + if (ret) + err("madvise(MADV_REMOVE) failed"); + + return NULL; +} + +static void uffd_consume_message(int fd) +{ + struct uffd_msg msg = { 0 }; + + while (uffd_read_msg(fd, &msg)); +} + +static void uffd_mmap_changing_test(uffd_test_args_t *targs) +{ + /* + * This stores the real PID (which can be different from how tid is + * defined..) for the child thread, 0 means not initialized. + */ + pid_t pid = 0; + pthread_t tid; + int ret; + + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, false, false)) + err("uffd_register() failed"); + + /* Create a thread to generate the racy event */ + ret = pthread_create(&tid, NULL, uffd_mmap_changing_thread, &pid); + if (ret) + err("pthread_create() failed"); + + /* + * Wait until the thread setup the pid. Use volatile to make sure + * it reads from RAM not regs. + */ + while (!(volatile pid_t)pid) + sleep_short(); + + /* Wait until the thread hangs at REMOVE event */ + thread_state_until(pid, THR_STATE_UNINTERRUPTIBLE); + + if (!uffdio_mmap_changing_test_copy(uffd)) + return; + + if (!uffdio_mmap_changing_test_zeropage(uffd)) + return; + + if (!uffdio_mmap_changing_test_move(uffd)) + return; + + if (!uffdio_mmap_changing_test_poison(uffd)) + return; + + if (!uffdio_mmap_changing_test_continue(uffd)) + return; + + /* + * All succeeded above! Recycle everything. Start by reading the + * event so as to kick the thread roll again.. + */ + uffd_consume_message(uffd); + + ret = pthread_join(tid, NULL); + assert(ret == 0); + + uffd_test_pass(); +} + static int prevent_hugepages(const char **errmsg) { /* This should be done before source area is populated */ @@ -1470,6 +1646,32 @@ uffd_test_case_t uffd_tests[] = { .mem_targets = MEM_ALL, .uffd_feature_required = UFFD_FEATURE_POISON, }, + { + .name = "mmap-changing", + .uffd_fn = uffd_mmap_changing_test, + /* + * There's no point running this test over all mem types as + * they share the same code paths. + * + * Choose shmem for simplicity, because (1) shmem supports + * MINOR mode to cover UFFDIO_CONTINUE, and (2) shmem is + * almost always available (unlike hugetlb). Here we + * abused SHMEM for UFFDIO_MOVE, but the test we want to + * cover doesn't yet need the correct memory type.. + */ + .mem_targets = MEM_SHMEM, + /* + * Any UFFD_FEATURE_EVENT_* should work to trigger the + * race logically, but choose the simplest (REMOVE). + * + * Meanwhile, since we'll cover quite a few new ioctl()s + * (CONTINUE, POISON, MOVE), skip this test for old kernels + * by choosing all of them. + */ + .uffd_feature_required = UFFD_FEATURE_EVENT_REMOVE | + UFFD_FEATURE_MOVE | UFFD_FEATURE_POISON | + UFFD_FEATURE_MINOR_SHMEM, + }, }; static void usage(const char *prog) -- cgit v1.2.3 From 6c36ac1e124f1be97cf0485a220865fce5a2020d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Apr 2025 16:28:14 +0100 Subject: mm: establish mm/vma_exec.c for shared exec/mm VMA functionality Patch series "move all VMA allocation, freeing and duplication logic to mm", v3. Currently VMA allocation, freeing and duplication exist in kernel/fork.c, which is a violation of separation of concerns, and leaves these functions exposed to the rest of the kernel when they are in fact internal implementation details. Resolve this by moving this logic to mm, and making it internal to vma.c, vma.h. This also allows us, in future, to provide userland testing around this functionality. We additionally abstract dup_mmap() to mm, being careful to ensure kernel/fork.c acceses this via the mm internal header so it is not exposed elsewhere in the kernel. As part of this change, also abstract initial stack allocation performed in __bprm_mm_init() out of fs code into mm via the create_init_stack_vma(), as this code uses vm_area_alloc() and vm_area_free(). In order to do so sensibly, we introduce a new mm/vma_exec.c file, which contains the code that is shared by mm and exec. This file is added to both memory mapping and exec sections in MAINTAINERS so both sets of maintainers can maintain oversight. As part of this change, we also move relocate_vma_down() to mm/vma_exec.c so all shared mm/exec functionality is kept in one place. We add code shared between nommu and mmu-enabled configurations in order to share VMA allocation, freeing and duplication code correctly while also keeping these functions available in userland VMA testing. This is achieved by adding a mm/vma_init.c file which is also compiled by the userland tests. This patch (of 4): There is functionality that overlaps the exec and memory mapping subsystems. While it properly belongs in mm, it is important that exec maintainers maintain oversight of this functionality correctly. We can establish both goals by adding a new mm/vma_exec.c file which contains these 'glue' functions, and have fs/exec.c import them. As a part of this change, to ensure that proper oversight is achieved, add the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections. scripts/get_maintainer.pl can correctly handle files in multiple entries and this neatly handles the cross-over. [akpm@linux-foundation.org: fix comment typo] Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Reviewed-by: Pedro Falcato Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Signed-off-by: Andrew Morton --- tools/testing/vma/Makefile | 2 +- tools/testing/vma/vma.c | 1 + tools/testing/vma/vma_internal.h | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile index 860fd2311dcc..624040fcf193 100644 --- a/tools/testing/vma/Makefile +++ b/tools/testing/vma/Makefile @@ -9,7 +9,7 @@ include ../shared/shared.mk OFILES = $(SHARED_OFILES) vma.o maple-shim.o TARGETS = vma -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h vma: $(OFILES) $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 7cfd6e31db10..5832ae5d797d 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<mas); @@ -1240,4 +1262,22 @@ static inline int mapping_map_writable(struct address_space *mapping) return 0; } +static inline unsigned long move_page_tables(struct pagetable_move_control *pmc) +{ + (void)pmc; + + return 0; +} + +static inline void free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + (void)tlb; + (void)addr; + (void)end; + (void)floor; + (void)ceiling; +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From dd7a6246f4fd6e8a6dcb08f1f51c899f3e0d3b83 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Apr 2025 16:28:15 +0100 Subject: mm: abstract initial stack setup to mm subsystem There are peculiarities within the kernel where what is very clearly mm code is performed elsewhere arbitrarily. This violates separation of concerns and makes it harder to refactor code to make changes to how fundamental initialisation and operation of mm logic is performed. One such case is the creation of the VMA containing the initial stack upon execve()'ing a new process. This is currently performed in __bprm_mm_init() in fs/exec.c. Abstract this operation to create_init_stack_vma(). This allows us to limit use of vma allocation and free code to fork and mm only. We previously did the same for the step at which we relocate the initial stack VMA downwards via relocate_vma_down(), now we move the initial VMA establishment too. Take the opportunity to also move insert_vm_struct() to mm/vma.c as it's no longer needed anywhere outside of mm. Link: https://lkml.kernel.org/r/118c950ef7a8dd19ab20a23a68c3603751acd30e.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Reviewed-by: Pedro Falcato Reviewed-by: Kees Cook Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 0df19ca0000a..32e990313158 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -56,6 +56,8 @@ extern unsigned long dac_mmap_min_addr; #define VM_PFNMAP 0x00000400 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 +#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ +#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTEXPAND 0x00040000 #define VM_LOCKONFAULT 0x00080000 #define VM_ACCOUNT 0x00100000 @@ -70,6 +72,20 @@ extern unsigned long dac_mmap_min_addr; #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +#ifdef CONFIG_STACK_GROWSUP +#define VM_STACK VM_GROWSUP +#define VM_STACK_EARLY VM_GROWSDOWN +#else +#define VM_STACK VM_GROWSDOWN +#define VM_STACK_EARLY 0 +#endif + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_LOW DEFAULT_MAP_WINDOW +#define TASK_SIZE_MAX DEFAULT_MAP_WINDOW +#define STACK_TOP TASK_SIZE_LOW +#define STACK_TOP_MAX TASK_SIZE_MAX + /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) @@ -82,6 +98,10 @@ extern unsigned long dac_mmap_min_addr; #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ @@ -1280,4 +1300,16 @@ static inline void free_pgd_range(struct mmu_gather *tlb, (void)ceiling; } +static inline int ksm_execve(struct mm_struct *mm) +{ + (void)mm; + + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ + (void)mm; +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From 3e43e260f1e44d21861815faa905a1829027600f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Apr 2025 16:28:17 +0100 Subject: mm: perform VMA allocation, freeing, duplication in mm Right now these are performed in kernel/fork.c which is odd and a violation of separation of concerns, as well as preventing us from integrating this and related logic into userland VMA testing going forward. There is a fly in the ointment - nommu - mmap.c is not compiled if CONFIG_MMU not set, and neither is vma.c. To square the circle, let's add a new file - vma_init.c. This will be compiled for both CONFIG_MMU and nommu builds, and will also form part of the VMA userland testing. This allows us to de-duplicate code, while maintaining separation of concerns and the ability for us to userland test this logic. Update the VMA userland tests accordingly, additionally adding a detach_free_vma() helper function to correctly detach VMAs before freeing them in test code, as this change was triggering the assert for this. [akpm@linux-foundation.org: remove stray newline, per Liam] Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Pedro Falcato Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Signed-off-by: Andrew Morton --- tools/testing/vma/Makefile | 2 +- tools/testing/vma/vma.c | 26 ++++--- tools/testing/vma/vma_internal.h | 143 ++++++++++++++++++++++++++++++++------- 3 files changed, 136 insertions(+), 35 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile index 624040fcf193..66f3831a668f 100644 --- a/tools/testing/vma/Makefile +++ b/tools/testing/vma/Makefile @@ -9,7 +9,7 @@ include ../shared/shared.mk OFILES = $(SHARED_OFILES) vma.o maple-shim.o TARGETS = vma -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h vma: $(OFILES) $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 5832ae5d797d..2be7597a2ac2 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<vm_pgoff, 0); ASSERT_EQ(vma->vm_flags, flags); - vm_area_free(vma); + detach_free_vma(vma); mtree_destroy(&mm.mm_mt); return true; @@ -361,7 +368,7 @@ static bool test_simple_modify(void) ASSERT_EQ(vma->vm_end, 0x1000); ASSERT_EQ(vma->vm_pgoff, 0); - vm_area_free(vma); + detach_free_vma(vma); vma_iter_clear(&vmi); vma = vma_next(&vmi); @@ -370,7 +377,7 @@ static bool test_simple_modify(void) ASSERT_EQ(vma->vm_end, 0x2000); ASSERT_EQ(vma->vm_pgoff, 1); - vm_area_free(vma); + detach_free_vma(vma); vma_iter_clear(&vmi); vma = vma_next(&vmi); @@ -379,7 +386,7 @@ static bool test_simple_modify(void) ASSERT_EQ(vma->vm_end, 0x3000); ASSERT_EQ(vma->vm_pgoff, 2); - vm_area_free(vma); + detach_free_vma(vma); mtree_destroy(&mm.mm_mt); return true; @@ -407,7 +414,7 @@ static bool test_simple_expand(void) ASSERT_EQ(vma->vm_end, 0x3000); ASSERT_EQ(vma->vm_pgoff, 0); - vm_area_free(vma); + detach_free_vma(vma); mtree_destroy(&mm.mm_mt); return true; @@ -428,7 +435,7 @@ static bool test_simple_shrink(void) ASSERT_EQ(vma->vm_end, 0x1000); ASSERT_EQ(vma->vm_pgoff, 0); - vm_area_free(vma); + detach_free_vma(vma); mtree_destroy(&mm.mm_mt); return true; @@ -619,7 +626,7 @@ static bool test_merge_new(void) ASSERT_EQ(vma->vm_pgoff, 0); ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); - vm_area_free(vma); + detach_free_vma(vma); count++; } @@ -1668,6 +1675,7 @@ int main(void) int num_tests = 0, num_fail = 0; maple_tree_init(); + vma_state_init(); #define TEST(name) \ do { \ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 32e990313158..198abe66de5a 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t; */ #define pr_warn_once pr_err +#define data_race(expr) expr + +#define ASSERT_EXCLUSIVE_WRITER(x) + struct kref { refcount_t refcount; }; @@ -255,6 +259,8 @@ struct file { #define VMA_LOCK_OFFSET 0x40000000 +typedef struct { unsigned long v; } freeptr_t; + struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -264,9 +270,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. */ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; struct mm_struct *vm_mm; /* The address space we belong to. */ @@ -463,6 +467,65 @@ struct pagetable_move_control { .len_in = len_, \ } +struct kmem_cache_args { + /** + * @align: The required alignment for the objects. + * + * %0 means no specific alignment is requested. + */ + unsigned int align; + /** + * @useroffset: Usercopy region offset. + * + * %0 is a valid offset, when @usersize is non-%0 + */ + unsigned int useroffset; + /** + * @usersize: Usercopy region size. + * + * %0 means no usercopy region is specified. + */ + unsigned int usersize; + /** + * @freeptr_offset: Custom offset for the free pointer + * in &SLAB_TYPESAFE_BY_RCU caches + * + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer + * outside of the object. This might cause the object to grow in size. + * Cache creators that have a reason to avoid this can specify a custom + * free pointer offset in their struct where the free pointer will be + * placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for + * details). + * + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset + * is specified, %use_freeptr_offset must be set %true. + * + * Note that @ctor currently isn't supported with custom free pointers + * as a @ctor requires an external free pointer. + */ + unsigned int freeptr_offset; + /** + * @use_freeptr_offset: Whether a @freeptr_offset is used. + */ + bool use_freeptr_offset; + /** + * @ctor: A constructor for the objects. + * + * The constructor is invoked for each object in a newly allocated slab + * page. It is the cache user's responsibility to free object in the + * same state as after calling the constructor, or deal appropriately + * with any differences between a freshly constructed and a reallocated + * object. + * + * %NULL means no constructor. + */ + void (*ctor)(void *); +}; + static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); @@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_lock_seq = UINT_MAX; } -static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) -{ - struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct)); +struct kmem_cache { + const char *name; + size_t object_size; + struct kmem_cache_args *args; +}; - if (!vma) - return NULL; +static inline struct kmem_cache *__kmem_cache_create(const char *name, + size_t object_size, + struct kmem_cache_args *args) +{ + struct kmem_cache *ret = malloc(sizeof(struct kmem_cache)); - vma_init(vma, mm); + ret->name = name; + ret->object_size = object_size; + ret->args = args; - return vma; + return ret; } -static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) -{ - struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct)); +#define kmem_cache_create(__name, __object_size, __args, ...) \ + __kmem_cache_create((__name), (__object_size), (__args)) - if (!new) - return NULL; +static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + (void)gfpflags; - memcpy(new, orig, sizeof(*new)); - refcount_set(&new->vm_refcnt, 0); - new->vm_lock_seq = UINT_MAX; - INIT_LIST_HEAD(&new->anon_vma_chain); + return calloc(s->object_size, 1); +} - return new; +static inline void kmem_cache_free(struct kmem_cache *s, void *x) +{ + free(x); } /* @@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *) { } -static inline void vm_area_free(struct vm_area_struct *vma) -{ - free(vma); -} - static inline void lru_add_drain(void) { } @@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm) (void)mm; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ + (void)vma; + (void)reset_refcnt; +} + +static inline void vma_numab_state_init(struct vm_area_struct *vma) +{ + (void)vma; +} + +static inline void vma_numab_state_free(struct vm_area_struct *vma) +{ + (void)vma; +} + +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + (void)orig_vma; + (void)new_vma; +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) +{ + (void)vma; +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From fa6b8b5d9f97778bc44c8a9fe33a0e4b8fae5f92 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 1 May 2025 21:04:42 -0400 Subject: selftests: memcg: allow low event with no memory.low and memory_recursiveprot on MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "memcg: Fix test_memcg_min/low test failures", v8. The test_memcontrol selftest consistently fails its test_memcg_low sub-test (with memory_recursiveprot enabled) and sporadically fails its test_memcg_min sub-test. This patchset fixes the test_memcg_min and test_memcg_low failures by adjusting the test_memcontrol selftest to fix these test failures. This patch (of 8): The test_memcontrol selftest consistently fails its test_memcg_low sub-test due to the fact that its 3rd test child cgroup which have a memmory.low of 0 have low event count. This happens when memory_recursiveprot mount option is enabled which is the default setting used by systemd to mount cgroup2 filesystem. This issue was originally fixed by commit cdc69458a5f3 ("cgroup: account for memory_recursiveprot in test_memcg_low()"). It was later reverted by commit 1d09069f5313 ("selftests: memcg: expect no low events in unprotected sibling") expecting the memory reclaim code would be fixed. However, it turns out the unprotected cgroup may still have some residual effective memory.low protection depending on the memory.low settings in its parent and its siblings. As a result, low events may still be triggered. One way to fix the test failure is to revert the revert commit. However, Michal suggested that it might be better to ignore the low event count with memory_recursiveprot enabled as low event may or may not happen depending on the actual test configuration. Modify the test_memcontrol.c to ignore low event in the 3rd child cgroup with memory_recursiveprot on. The 4th child cgroup has no memory usage and so has an effective low of 0. It has no low event count because the mem_cgroup_below_low() check in shrink_node_memcgs() is skipped as mem_cgroup_below_min() returns true. If we ever change mem_cgroup_below_min() in such a way that it no longer skips the no usage case, we will have to add code to explicitly skip it. With this patch applied, the test_memcg_low sub-test finishes successfully without failure in most cases. Though both test_memcg_low and test_memcg_min sub-tests may still fail occasionally if the memory.current values fall outside of the expected ranges. Link: https://lkml.kernel.org/r/20250502010443.106022-1-longman@redhat.com Link: https://lkml.kernel.org/r/20250502010443.106022-2-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Michal Koutný Acked-by: Michal Koutný Acked-by: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Waiman Long Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 16f5d74ae762..58602c1831f1 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -380,10 +380,11 @@ static bool reclaim_until(const char *memcg, long goal); * * Then it checks actual memory usages and expects that: * A/B memory.current ~= 50M - * A/B/C memory.current ~= 29M - * A/B/D memory.current ~= 21M - * A/B/E memory.current ~= 0 - * A/B/F memory.current = 0 + * A/B/C memory.current ~= 29M [memory.events:low > 0] + * A/B/D memory.current ~= 21M [memory.events:low > 0] + * A/B/E memory.current ~= 0 [memory.events:low == 0 if !memory_recursiveprot, + * undefined otherwise] + * A/B/F memory.current = 0 [memory.events:low == 0] * (for origin of the numbers, see model in memcg_protection.m.) * * After that it tries to allocate more than there is @@ -525,7 +526,14 @@ static int test_memcg_protection(const char *root, bool min) goto cleanup; } + /* + * Child 2 has memory.low=0, but some low protection may still be + * distributed down from its parent with memory.low=50M if cgroup2 + * memory_recursiveprot mount option is enabled. Ignore the low + * event count in this case. + */ for (i = 0; i < ARRAY_SIZE(children); i++) { + int ignore_low_events_index = has_recursiveprot ? 2 : -1; int no_low_events_index = 1; long low, oom; @@ -534,6 +542,8 @@ static int test_memcg_protection(const char *root, bool min) if (oom) goto cleanup; + if (i == ignore_low_events_index) + continue; if (i <= no_low_events_index && low <= 0) goto cleanup; if (i > no_low_events_index && low) -- cgit v1.2.3 From d2def68ae06ab6a1f38bc1a2449b06ee4f108412 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 1 May 2025 21:04:43 -0400 Subject: selftests: memcg: increase error tolerance of child memory.current check in test_memcg_protection() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_memcg_protection() function is used for the test_memcg_min and test_memcg_low sub-tests. This function generates a set of parent/child cgroups like: parent: memory.min/low = 50M child 0: memory.min/low = 75M, memory.current = 50M child 1: memory.min/low = 25M, memory.current = 50M child 2: memory.min/low = 0, memory.current = 50M After applying memory pressure, the function expects the following actual memory usages. parent: memory.current ~= 50M child 0: memory.current ~= 29M child 1: memory.current ~= 21M child 2: memory.current ~= 0 In reality, the actual memory usages can differ quite a bit from the expected values. It uses an error tolerance of 10% with the values_close() helper. Both the test_memcg_min and test_memcg_low sub-tests can fail sporadically because the actual memory usage exceeds the 10% error tolerance. Below are a sample of the usage data of the tests runs that fail. Child Actual usage Expected usage %err ----- ------------ -------------- ---- 1 16990208 22020096 -12.9% 1 17252352 22020096 -12.1% 0 37699584 30408704 +10.7% 1 14368768 22020096 -21.0% 1 16871424 22020096 -13.2% The current 10% error tolerenace might be right at the time test_memcontrol.c was first introduced in v4.18 kernel, but memory reclaim have certainly evolved quite a bit since then which may result in a bit more run-to-run variation than previously expected. Increase the error tolerance to 15% for child 0 and 20% for child 1 to minimize the chance of this type of failure. The tolerance is bigger for child 1 because an upswing in child 0 corresponds to a smaller %err than a similar downswing in child 1 due to the way %err is used in values_close(). Before this patch, a 100 test runs of test_memcontrol produced the following results: 17 not ok 1 test_memcg_min 22 not ok 2 test_memcg_low After applying this patch, there were no test failure for test_memcg_min and test_memcg_low in 100 test runs. However, these tests may still fail once in a while if the memory usage goes beyond the newly extended range. Link: https://lkml.kernel.org/r/20250502010443.106022-3-longman@redhat.com Signed-off-by: Waiman Long Acked-by: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 58602c1831f1..d6534d7301a2 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -496,10 +496,10 @@ static int test_memcg_protection(const char *root, bool min) for (i = 0; i < ARRAY_SIZE(children); i++) c[i] = cg_read_long(children[i], "memory.current"); - if (!values_close(c[0], MB(29), 10)) + if (!values_close(c[0], MB(29), 15)) goto cleanup; - if (!values_close(c[1], MB(21), 10)) + if (!values_close(c[1], MB(21), 20)) goto cleanup; if (c[3] != 0) -- cgit v1.2.3 From c84bf6dd2b836b49bb2662668ff1692350d28236 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 9 May 2025 13:13:34 +0100 Subject: mm: introduce new .mmap_prepare() file callback Patch series "eliminate mmap() retry merge, add .mmap_prepare hook", v2. During the mmap() of a file-backed mapping, we invoke the underlying driver file's mmap() callback in order to perform driver/file system initialisation of the underlying VMA. This has been a source of issues in the past, including a significant security concern relating to unwinding of error state discovered by Jann Horn, as fixed in commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") which performed the recent, significant, rework of mmap() as a whole. However, we have had a fly in the ointment remain - drivers have a great deal of freedom in the .mmap() hook to manipulate VMA state (as well as page table state). This can be problematic, as we can no longer reason sensibly about VMA state once the call is complete (the ability to do - anything - here does rather interfere with that). In addition, callers may choose to do odd or unusual things which might interfere with subsequent steps in the mmap() process, and it may do so and then raise an error, requiring very careful unwinding of state about which we can make no assumptions. Rather than providing such an open-ended interface, this series provides an alternative, far more restrictive one - we expose a whitelist of fields which can be adjusted by the driver, along with immutable state upon which the driver can make such decisions: struct vm_area_desc { /* Immutable state. */ struct mm_struct *mm; unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *file; vm_flags_t vm_flags; pgprot_t page_prot; /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; }; The mmap logic then updates the state used to either merge with a VMA or establish a new VMA based upon this logic. This is achieved via new file hook .mmap_prepare(), which is, importantly, invoked very early on in the mmap() process. If an error arises, we can very simply abort the operation with very little unwinding of state required. The existing logic contains another, related, peccadillo - since the .mmap() callback might do anything, it may also cause a previously unmergeable VMA to become mergeable with adjacent VMAs. Right now the logic will retry a merge like this only if the driver changes VMA flags, and changes them in such a way that a merge might succeed (that is, the flags are not 'special', that is do not contain any of the flags specified in VM_SPECIAL). This has also been the source of a great deal of pain - it's hard to reason about an .mmap() callback that might do - anything - but it's also hard to reason about setting up a VMA and writing to the maple tree, only to do it again utilising a great deal of shared state. Since .mmap_prepare() sets fields before the first merge is even attempted, the use of this callback obviates the need for this retry merge logic. A driver may only specify .mmap_prepare() or the deprecated .mmap() callback. In future we may add futher callbacks beyond .mmap_prepare() to faciliate all use cass as we convert drivers. In researching this change, I examined every .mmap() callback, and discovered only a very few that set VMA state in such a way that a. the VMA flags changed and b. this would be mergeable. In the majority of cases, it turns out that drivers are mapping kernel memory and thus ultimately set VM_PFNMAP, VM_MIXEDMAP, or other unmergeable VM_SPECIAL flags. Of those that remain I identified a number of cases which are only applicable in DAX, setting the VM_HUGEPAGE flag: * dax_mmap() * erofs_file_mmap() * ext4_file_mmap() * xfs_file_mmap() For this remerge to not occur and to impact users, each of these cases would require a user to mmap() files using DAX, in parts, immediately adjacent to one another. This is a very unlikely usecase and so it does not appear to be worthwhile to adjust this functionality accordingly. We can, however, very quickly do so if needed by simply adding an .mmap_prepare() callback to these as required. There are two further non-DAX cases I idenitfied: * orangefs_file_mmap() - Clears VM_RAND_READ if set, replacing with VM_SEQ_READ. * usb_stream_hwdep_mmap() - Sets VM_DONTDUMP. Both of these cases again seem very unlikely to be mmap()'d immediately adjacent to one another in a fashion that would result in a merge. Finally, we are left with a viable case: * secretmem_mmap() - Set VM_LOCKED, VM_DONTDUMP. This is viable enough that the mm selftests trigger the logic as a matter of course. Therefore, this series replace the .secretmem_mmap() hook with .secret_mmap_prepare(). This patch (of 3): Provide a means by which drivers can specify which fields of those permitted to be changed should be altered to prior to mmap()'ing a range (which may either result from a merge or from mapping an entirely new VMA). Doing so is substantially safer than the existing .mmap() calback which provides unrestricted access to the part-constructed VMA and permits drivers and file systems to do 'creative' things which makes it hard to reason about the state of the VMA after the function returns. The existing .mmap() callback's freedom has caused a great deal of issues, especially in error handling, as unwinding the mmap() state has proven to be non-trivial and caused significant issues in the past, for instance those addressed in commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour"). It also necessitates a second attempt at merge once the .mmap() callback has completed, which has caused issues in the past, is awkward, adds overhead and is difficult to reason about. The .mmap_prepare() callback eliminates this requirement, as we can update fields prior to even attempting the first merge. It is safer, as we heavily restrict what can actually be modified, and being invoked very early in the mmap() process, error handling can be performed safely with very little unwinding of state required. The .mmap_prepare() and deprecated .mmap() callbacks are mutually exclusive, so we permit only one to be invoked at a time. Update vma userland test stubs to account for changes. Link: https://lkml.kernel.org/r/cover.1746792520.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/adb36a7c4affd7393b2fc4b54cc5cfe211e41f71.1746792520.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: Jann Horn Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 66 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 198abe66de5a..f6e45e62da3a 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -253,8 +253,40 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access */ }; +struct vm_area_struct; + +/* + * Describes a VMA that is about to be mmap()'ed. Drivers may choose to + * manipulate mutable fields which will cause those fields to be updated in the + * resultant VMA. + * + * Helper functions are not required for manipulating any field. + */ +struct vm_area_desc { + /* Immutable state. */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + + /* Mutable fields. Populated with initial state. */ + pgoff_t pgoff; + struct file *file; + vm_flags_t vm_flags; + pgprot_t page_prot; + + /* Write-only fields. */ + const struct vm_operations_struct *vm_ops; + void *private_data; +}; + +struct file_operations { + int (*mmap)(struct file *, struct vm_area_struct *); + int (*mmap_prepare)(struct vm_area_desc *); +}; + struct file { struct address_space *f_mapping; + const struct file_operations *f_op; }; #define VMA_LOCK_OFFSET 0x40000000 @@ -1125,11 +1157,6 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, vma->__vm_flags &= ~flags; } -static inline int call_mmap(struct file *, struct vm_area_struct *) -{ - return 0; -} - static inline int shmem_zero_setup(struct vm_area_struct *) { return 0; @@ -1405,4 +1432,33 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) (void)vma; } +/* Did the driver provide valid mmap hook configuration? */ +static inline bool file_has_valid_mmap_hooks(struct file *file) +{ + bool has_mmap = file->f_op->mmap; + bool has_mmap_prepare = file->f_op->mmap_prepare; + + /* Hooks are mutually exclusive. */ + if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) + return false; + if (WARN_ON_ONCE(!has_mmap && !has_mmap_prepare)) + return false; + + return true; +} + +static inline int call_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (WARN_ON_ONCE(file->f_op->mmap_prepare)) + return -EINVAL; + + return file->f_op->mmap(file, vma); +} + +static inline int __call_mmap_prepare(struct file *file, + struct vm_area_desc *desc) +{ + return file->f_op->mmap_prepare(desc); +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From 2616b370323a953c437ed2bf40a277e9deaa3709 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 9 May 2025 17:30:32 +0200 Subject: selftests/mm: add simple VM_PFNMAP tests based on mmap'ing /dev/mem Let's test some basic functionality using /dev/mem. These tests will implicitly cover some PAT (Page Attribute Handling) handling on x86. These tests will only run when /dev/mem access to the first two pages in physical address space is possible and allowed; otherwise, the tests are skipped. On current x86-64 with PAT inside a VM, all tests pass: TAP version 13 1..6 # Starting 6 tests from 1 test cases. # RUN pfnmap.madvise_disallowed ... # OK pfnmap.madvise_disallowed ok 1 pfnmap.madvise_disallowed # RUN pfnmap.munmap_split ... # OK pfnmap.munmap_split ok 2 pfnmap.munmap_split # RUN pfnmap.mremap_fixed ... # OK pfnmap.mremap_fixed ok 3 pfnmap.mremap_fixed # RUN pfnmap.mremap_shrink ... # OK pfnmap.mremap_shrink ok 4 pfnmap.mremap_shrink # RUN pfnmap.mremap_expand ... # OK pfnmap.mremap_expand ok 5 pfnmap.mremap_expand # RUN pfnmap.fork ... # OK pfnmap.fork ok 6 pfnmap.fork # PASSED: 6 / 6 tests passed. # Totals: pass:6 fail:0 xfail:0 xpass:0 skip:0 error:0 However, we are able to trigger: [ 27.888251] x86/PAT: pfnmap:1790 freeing invalid memtype [mem 0x00000000-0x00000fff] There are probably more things worth testing in the future, such as MAP_PRIVATE handling. But this set of tests is sufficient to cover most of the things we will rework regarding PAT handling. Link: https://lkml.kernel.org/r/20250509153033.952746-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Shuah Khan Cc: Ingo Molnar Cc: Peter Xu Cc: Dev Jain Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/pfnmap.c | 196 ++++++++++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 4 + 4 files changed, 202 insertions(+) create mode 100644 tools/testing/selftests/mm/pfnmap.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 91db34941a14..824266982aa3 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -20,6 +20,7 @@ mremap_test on-fault-limit transhuge-stress pagemap_ioctl +pfnmap *.tmp* protection_keys protection_keys_32 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index ad4d6043a60f..ae6f994d3add 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -84,6 +84,7 @@ TEST_GEN_FILES += mremap_test TEST_GEN_FILES += mseal_test TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl +TEST_GEN_FILES += pfnmap TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c new file mode 100644 index 000000000000..8a9d19b6020c --- /dev/null +++ b/tools/testing/selftests/mm/pfnmap.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Basic VM_PFNMAP tests relying on mmap() of '/dev/mem' + * + * Copyright 2025, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" +#include "vm_util.h" + +static sigjmp_buf sigjmp_buf_env; + +static void signal_handler(int sig) +{ + siglongjmp(sigjmp_buf_env, -EFAULT); +} + +static int test_read_access(char *addr, size_t size, size_t pagesize) +{ + size_t offs; + int ret; + + if (signal(SIGSEGV, signal_handler) == SIG_ERR) + return -EINVAL; + + ret = sigsetjmp(sigjmp_buf_env, 1); + if (!ret) { + for (offs = 0; offs < size; offs += pagesize) + /* Force a read that the compiler cannot optimize out. */ + *((volatile char *)(addr + offs)); + } + if (signal(SIGSEGV, signal_handler) == SIG_ERR) + return -EINVAL; + + return ret; +} + +FIXTURE(pfnmap) +{ + size_t pagesize; + int dev_mem_fd; + char *addr1; + size_t size1; + char *addr2; + size_t size2; +}; + +FIXTURE_SETUP(pfnmap) +{ + self->pagesize = getpagesize(); + + self->dev_mem_fd = open("/dev/mem", O_RDONLY); + if (self->dev_mem_fd < 0) + SKIP(return, "Cannot open '/dev/mem'\n"); + + /* We'll require the first two pages throughout our tests ... */ + self->size1 = self->pagesize * 2; + self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED, + self->dev_mem_fd, 0); + if (self->addr1 == MAP_FAILED) + SKIP(return, "Cannot mmap '/dev/mem'\n"); + + /* ... and want to be able to read from them. */ + if (test_read_access(self->addr1, self->size1, self->pagesize)) + SKIP(return, "Cannot read-access mmap'ed '/dev/mem'\n"); + + self->size2 = 0; + self->addr2 = MAP_FAILED; +} + +FIXTURE_TEARDOWN(pfnmap) +{ + if (self->addr2 != MAP_FAILED) + munmap(self->addr2, self->size2); + if (self->addr1 != MAP_FAILED) + munmap(self->addr1, self->size1); + if (self->dev_mem_fd >= 0) + close(self->dev_mem_fd); +} + +TEST_F(pfnmap, madvise_disallowed) +{ + int advices[] = { + MADV_DONTNEED, + MADV_DONTNEED_LOCKED, + MADV_FREE, + MADV_WIPEONFORK, + MADV_COLD, + MADV_PAGEOUT, + MADV_POPULATE_READ, + MADV_POPULATE_WRITE, + }; + int i; + + /* All these advices must be rejected. */ + for (i = 0; i < ARRAY_SIZE(advices); i++) { + EXPECT_LT(madvise(self->addr1, self->pagesize, advices[i]), 0); + EXPECT_EQ(errno, EINVAL); + } +} + +TEST_F(pfnmap, munmap_split) +{ + /* + * Unmap the first page. This munmap() call is not really expected to + * fail, but we might be able to trigger other internal issues. + */ + ASSERT_EQ(munmap(self->addr1, self->pagesize), 0); + + /* + * Remap the first page while the second page is still mapped. This + * makes sure that any PAT tracking on x86 will allow for mmap()'ing + * a page again while some parts of the first mmap() are still + * around. + */ + self->size2 = self->pagesize; + self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED, + self->dev_mem_fd, 0); + ASSERT_NE(self->addr2, MAP_FAILED); +} + +TEST_F(pfnmap, mremap_fixed) +{ + char *ret; + + /* Reserve a destination area. */ + self->size2 = self->size1; + self->addr2 = mmap(NULL, self->size2, PROT_READ, MAP_ANON | MAP_PRIVATE, + -1, 0); + ASSERT_NE(self->addr2, MAP_FAILED); + + /* mremap() over our destination. */ + ret = mremap(self->addr1, self->size1, self->size2, + MREMAP_FIXED | MREMAP_MAYMOVE, self->addr2); + ASSERT_NE(ret, MAP_FAILED); +} + +TEST_F(pfnmap, mremap_shrink) +{ + char *ret; + + /* Shrinking is expected to work. */ + ret = mremap(self->addr1, self->size1, self->size1 - self->pagesize, 0); + ASSERT_NE(ret, MAP_FAILED); +} + +TEST_F(pfnmap, mremap_expand) +{ + /* + * Growing is not expected to work, and getting it right would + * be challenging. So this test primarily serves as an early warning + * that something that probably should never work suddenly works. + */ + self->size2 = self->size1 + self->pagesize; + self->addr2 = mremap(self->addr1, self->size1, self->size2, MREMAP_MAYMOVE); + ASSERT_EQ(self->addr2, MAP_FAILED); +} + +TEST_F(pfnmap, fork) +{ + pid_t pid; + int ret; + + /* fork() a child and test if the child can access the pages. */ + pid = fork(); + ASSERT_GE(pid, 0); + + if (!pid) { + EXPECT_EQ(test_read_access(self->addr1, self->size1, + self->pagesize), 0); + exit(0); + } + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + ASSERT_EQ(ret, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 188b125bf1f6..dddd1dd8af14 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -63,6 +63,8 @@ separated by spaces: test soft dirty page bit semantics - pagemap test pagemap_scan IOCTL +- pfnmap + tests for VM_PFNMAP handling - cow test copy-on-write semantics - thp @@ -472,6 +474,8 @@ fi CATEGORY="pagemap" run_test ./pagemap_ioctl +CATEGORY="pfnmap" run_test ./pfnmap + # COW tests CATEGORY="cow" run_test ./cow -- cgit v1.2.3 From 5fc4b770fc35c05874dd2dd5bf1f03d354025b62 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sun, 18 May 2025 15:04:42 +0100 Subject: selftests/mm: deduplicate second mmap() of 5*PAGE_SIZE at base The map_fixed_noreplace test does two blocks of test starting from a mapping of 5 pages at the base address, logging a test result for each initial mapping. These are logged with the same test name, causing test automation software to see two reports for the same test in a single run. Tweak the log message for the second one to deduplicate. Link: https://lkml.kernel.org/r/20250518-selftests-mm-map-fixed-noreplace-dup-v1-1-1a11a62c5e9f@kernel.org Signed-off-by: Mark Brown Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/map_fixed_noreplace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c index d53de2486080..1e9980b8993c 100644 --- a/tools/testing/selftests/mm/map_fixed_noreplace.c +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -96,7 +96,7 @@ int main(void) ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n"); } ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n"); + ksft_test_result_pass("Second mmap() 5*PAGE_SIZE at base\n"); /* * Second mapping contained within first: -- cgit v1.2.3 From 03f83209e8e72df7eb6ed0afc60adca492844082 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 May 2025 17:27:14 -0700 Subject: selftests/damon/_damon_sysfs: read tried regions directories in order Kdamond.update_schemes_tried_regions() reads and stores tried regions information out of address order. It makes debugging a test failure difficult. Change the behavior to do the reading and writing in the address order. Link: https://lkml.kernel.org/r/20250513002715.40126-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 6e136dc3df19..1e587e0b1a39 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -420,11 +420,16 @@ class Kdamond: tried_regions = [] tried_regions_dir = os.path.join( scheme.sysfs_dir(), 'tried_regions') + region_indices = [] for filename in os.listdir( os.path.join(scheme.sysfs_dir(), 'tried_regions')): tried_region_dir = os.path.join(tried_regions_dir, filename) if not os.path.isdir(tried_region_dir): continue + region_indices.append(int(filename)) + for region_idx in sorted(region_indices): + tried_region_dir = os.path.join(tried_regions_dir, + '%d' % region_idx) region_values = [] for f in ['start', 'end', 'nr_accesses', 'age']: content, err = read_file( -- cgit v1.2.3 From 19e0713bbe4a682b651dfd3773d2a06a9d61c47b Mon Sep 17 00:00:00 2001 From: Ryan Chung Date: Tue, 13 May 2025 16:44:11 +0900 Subject: selftests/eventfd: correct test name and improve messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename test from eventfd_chek_flag_cloexec_and_nonblock to eventfd_check_flag_cloexec_and_nonblock. - Make the RDWR‐flag comment declarative: “The kernel automatically adds the O_RDWR flag.” - Update semaphore‐flag failure message to: “eventfd semaphore flag check failed: …” Link: https://lkml.kernel.org/r/20250513074411.6965-1-seokwoo.chung130@gmail.com Signed-off-by: Ryan Chung Reviewed-by: Wen Yang Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/filesystems/eventfd/eventfd_test.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c index 85acb4e3ef00..72d51ad0ee0e 100644 --- a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c +++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c @@ -50,7 +50,7 @@ TEST(eventfd_check_flag_rdwr) ASSERT_GE(fd, 0); flags = fcntl(fd, F_GETFL); - // since the kernel automatically added O_RDWR. + // The kernel automatically adds the O_RDWR flag. EXPECT_EQ(flags, O_RDWR); close(fd); @@ -85,7 +85,7 @@ TEST(eventfd_check_flag_nonblock) close(fd); } -TEST(eventfd_chek_flag_cloexec_and_nonblock) +TEST(eventfd_check_flag_cloexec_and_nonblock) { int fd, flags; @@ -178,8 +178,7 @@ TEST(eventfd_check_flag_semaphore) // The semaphore could only be obtained from fdinfo. ret = verify_fdinfo(fd, &err, "eventfd-semaphore: ", 19, "1\n"); if (ret != 0) - ksft_print_msg("eventfd-semaphore check failed, msg: %s\n", - err.msg); + ksft_print_msg("eventfd semaphore flag check failed: %s\n", err.msg); EXPECT_EQ(ret, 0); close(fd); -- cgit v1.2.3