From bda7bf06840d4eb133abefa5a2fd75544277bd86 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 7 Oct 2025 18:29:35 +0800 Subject: selftests: update ksm inheritance tests for prctl fork/exec To reproduce the issue mentioned by [1], this add a setting of pages_to_scan and sleep_millisecs at the start of test_prctl_fork_exec(). The main change is just raise the scanning frequency of ksmd. [1] https://lore.kernel.org/all/202510012256278259zrhgATlLA2C510DMD3qI@zte.com.cn/ Link: https://lkml.kernel.org/r/20251007182935207jm31wCIgLpZg5XbXQY64S@zte.com.cn Signed-off-by: xu xin Cc: David Hildenbrand Cc: Jinjiang Tu Cc: Stefan Roesch Cc: Wang Yaxin Cc: Yang Yang Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/ksm_functional_tests.c | 57 +++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index ac136f04b8d6..95afa5cfc062 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -38,6 +38,8 @@ enum ksm_merge_mode { }; static int mem_fd; +static int pages_to_scan_fd; +static int sleep_millisecs_fd; static int pagemap_fd; static size_t pagesize; @@ -493,6 +495,46 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } +static int start_ksmd_and_set_frequency(char *pages_to_scan, char *sleep_ms) +{ + int ksm_fd; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + if (write(ksm_fd, "1", 1) != 1) + return -errno; + + if (write(pages_to_scan_fd, pages_to_scan, strlen(pages_to_scan)) <= 0) + return -errno; + + if (write(sleep_millisecs_fd, sleep_ms, strlen(sleep_ms)) <= 0) + return -errno; + + return 0; +} + +static int stop_ksmd_and_restore_frequency(void) +{ + int ksm_fd; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + if (write(ksm_fd, "2", 1) != 1) + return -errno; + + if (write(pages_to_scan_fd, "100", 3) <= 0) + return -errno; + + if (write(sleep_millisecs_fd, "20", 2) <= 0) + return -errno; + + return 0; +} + static void test_prctl_fork_exec(void) { int ret, status; @@ -500,6 +542,9 @@ static void test_prctl_fork_exec(void) ksft_print_msg("[RUN] %s\n", __func__); + if (start_ksmd_and_set_frequency("2000", "0")) + ksft_test_result_fail("set ksmd's scanning frequency failed\n"); + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); @@ -542,6 +587,11 @@ static void test_prctl_fork_exec(void) return; } + if (stop_ksmd_and_restore_frequency()) { + ksft_test_result_fail("restore ksmd frequency failed\n"); + return; + } + ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } @@ -656,6 +706,13 @@ static void init_global_file_handles(void) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); if (ksm_get_self_merging_pages() < 0) ksft_exit_skip("accessing \"/proc/self/ksm_merging_pages\") failed\n"); + + pages_to_scan_fd = open("/sys/kernel/mm/ksm/pages_to_scan", O_RDWR); + if (pages_to_scan_fd < 0) + ksft_exit_fail_msg("opening /sys/kernel/mm/ksm/pages_to_scan failed\n"); + sleep_millisecs_fd = open("/sys/kernel/mm/ksm/sleep_millisecs", O_RDWR); + if (sleep_millisecs_fd < 0) + ksft_exit_fail_msg("opening /sys/kernel/mm/ksm/sleep_millisecs failed\n"); } int main(int argc, char **argv) -- cgit v1.2.3 From ac0a3fc9c07df79dc8a4ce9d274df00afc7bf12d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:27 +0100 Subject: mm: add ability to take further action in vm_area_desc Some drivers/filesystems need to perform additional tasks after the VMA is set up. This is typically in the form of pre-population. The forms of pre-population most likely to be performed are a PFN remap or the insertion of normal folios and PFNs into a mixed map. We start by implementing the PFN remap functionality, ensuring that we perform the appropriate actions at the appropriate time - that is setting flags at the point of .mmap_prepare, and performing the actual remap at the point at which the VMA is fully established. This prevents the driver from doing anything too crazy with a VMA at any stage, and we retain complete control over how the mm functionality is applied. Unfortunately callers still do often require some kind of custom action, so we add an optional success/error _hook to allow the caller to do something after the action has succeeded or failed. This is done at the point when the VMA has already been established, so the harm that can be done is limited. The error hook can be used to filter errors if necessary. There may be cases in which the caller absolutely must hold the file rmap lock until the operation is entirely complete. It is an edge case, but certainly the hugetlbfs mmap hook requires it. To accommodate this, we add the hide_from_rmap_until_complete flag to the mmap_action type. In this case, if a new VMA is allocated, we will hold the file rmap lock until the operation is entirely completed (including any success/error hooks). Note that we do not need to update __compat_vma_mmap() to accommodate this flag, as this function will be invoked from an .mmap handler whose VMA is not yet visible, so we implicitly hide it from the rmap. If any error arises on these final actions, we simply unmap the VMA altogether. Also update the stacked filesystem compatibility layer to utilise the action behaviour, and update the VMA tests accordingly. While we're here, rename __compat_vma_mmap_prepare() to __compat_vma_mmap() as we are now performing actions invoked by the mmap_prepare in addition to just the mmap_prepare hook. Link: https://lkml.kernel.org/r/2601199a7b2eaeadfcd8ab6e199c6d1706650c94.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 98 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index dc976a285ad2..d873667704e8 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -275,6 +275,57 @@ struct mm_struct { struct vm_area_struct; + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -298,6 +349,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; struct file_operations { @@ -1326,12 +1380,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, - .file = vma->vm_file, + .file = file, .start = vma->vm_start, .end = vma->vm_end, @@ -1339,21 +1404,24 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -static inline int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } /* Did the driver provide valid mmap hook configuration? */ @@ -1374,7 +1442,7 @@ static inline bool can_mmap_file(struct file *file) static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } @@ -1407,4 +1475,20 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, return vm_flags; } +static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From badfa4361cb116fd9af71aaa2ea470236a8aa25b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:30 -0700 Subject: selftests/damon/_damon_sysfs: support obsolete_target file A DAMON sysfs file, namely obsolete_target, has been newly introduced. Add a support of that file to _damon_sysfs.py so that DAMON selftests for the file can be easily written. Link: https://lkml.kernel.org/r/20251023012535.69625-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index a0e6290833fb..748778b563cd 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -475,12 +475,14 @@ class Damos: class DamonTarget: pid = None + obsolete = None # todo: Support target regions if test is made idx = None context = None - def __init__(self, pid): + def __init__(self, pid, obsolete=False): self.pid = pid + self.obsolete = obsolete def sysfs_dir(self): return os.path.join( @@ -491,8 +493,13 @@ class DamonTarget: os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') if err is not None: return err - return write_file( + err = write_file( os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) + if err is not None: + return err + return write_file( + os.path.join(self.sysfs_dir(), 'obsolete_target'), + 'Y' if self.obsolete else 'N') class IntervalsGoal: access_bp = None -- cgit v1.2.3 From a00f18abef3750167eef7aa9ecd96da96f74fc3f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:31 -0700 Subject: drgn_dump_damon_status: dump damon_target->obsolete A new field of damon_target for pin-point target removal, namely obsolete, has newly been added. Extend drgn_dump_damon_status.py to dump it, for easily writing a future DAMON selftests of it. Link: https://lkml.kernel.org/r/20251023012535.69625-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/drgn_dump_damon_status.py | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index 7233369a3a44..cb4fdbe68acb 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -73,6 +73,7 @@ def target_to_dict(target): ['pid', int], ['nr_regions', int], ['regions_list', regions_to_list], + ['obsolete', bool], ]) def targets_to_list(targets): -- cgit v1.2.3 From 65a9033db722bef4226b41fc205dab304c3fec70 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:32 -0700 Subject: sysfs.py: extend assert_ctx_committed() for monitoring targets assert_ctx_committed() is not asserting monitoring targets commitment, since all existing callers of the function assume no target changes. Extend it for future usage. Link: https://lkml.kernel.org/r/20251023012535.69625-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 2666c6f0f1a5..fd8d3698326e 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -164,6 +164,16 @@ def assert_monitoring_attrs_committed(attrs, dump): assert_true(dump['max_nr_regions'] == attrs.max_nr_regions, 'max_nr_regions', dump) +def assert_monitoring_target_committed(target, dump): + # target.pid is the pid "number", while dump['pid'] is 'struct pid' + # pointer, and hence cannot be compared. + assert_true(dump['obsolete'] == target.obsolete, 'target obsolete', dump) + +def assert_monitoring_targets_committed(targets, dump): + assert_true(len(targets) == len(dump), 'len_targets', dump) + for idx, target in enumerate(targets): + assert_monitoring_target_committed(target, dump[idx]) + def assert_ctx_committed(ctx, dump): ops_val = { 'vaddr': 0, @@ -172,6 +182,7 @@ def assert_ctx_committed(ctx, dump): } assert_true(dump['ops']['id'] == ops_val[ctx.ops], 'ops_id', dump) assert_monitoring_attrs_committed(ctx.monitoring_attrs, dump['attrs']) + assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets']) assert_schemes_committed(ctx.schemes, dump['schemes']) def assert_ctxs_committed(ctxs, dump): -- cgit v1.2.3 From 809ba69f9f4d17f9c04f4aaba5e680fdd71975dd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:33 -0700 Subject: selftests/damon/sysfs: add obsolete_target test A new DAMON sysfs file for pin-point target removal, namely obsolete_target, has been added. Add a test for the functionality. It starts DAMON with three monitoring target processes, mark one in the middle as obsolete, commit it, and confirm the internal DAMON status is updated to remove the target in the middle. Link: https://lkml.kernel.org/r/20251023012535.69625-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index fd8d3698326e..b34aea0a6775 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -279,5 +279,42 @@ def main(): kdamonds.stop() + # test obsolete_target. + proc1 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + proc2 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + proc3 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + kdamonds = _damon_sysfs.Kdamonds( + [_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[ + _damon_sysfs.DamonTarget(pid=proc1.pid), + _damon_sysfs.DamonTarget(pid=proc2.pid), + _damon_sysfs.DamonTarget(pid=proc3.pid), + ], + schemes=[_damon_sysfs.Damos()], + )])]) + err = kdamonds.start() + if err is not None: + print('kdamond start failed: %s' % err) + exit(1) + kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True + kdamonds.kdamonds[0].commit() + + status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) + if err is not None: + print(err) + kdamonds.stop() + exit(1) + + del kdamonds.kdamonds[0].contexts[0].targets[1] + + assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + + kdamonds.stop() + if __name__ == '__main__': main() -- cgit v1.2.3 From 3b12a53b64d0c86cf68cab772bd4137e451b17a5 Mon Sep 17 00:00:00 2001 From: Ankit Khushwaha Date: Sat, 8 Nov 2025 21:48:29 +0530 Subject: selftest/mm: fix pointer comparison in mremap_test Pointer arthemitic with 'void * addr' and 'ulong dest_alignment' triggers following warning: mremap_test.c:1035:31: warning: pointer comparison always evaluates to false [-Wtautological-compare] 1035 | if (addr + c.dest_alignment < addr) { | ^ this warning is raised from clang version 20.1.8 (Fedora 20.1.8-4.fc42). use 'void *tmp_addr' to do the pointer arthemitic. Link: https://lkml.kernel.org/r/20251108161829.25105-1-ankitkhushwaha.linux@gmail.com Signed-off-by: Ankit Khushwaha Acked-by: Mike Rapoport (Microsoft) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index bf2863b102e3..5f073504e0b1 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -994,7 +994,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp, unsigned long page_siz static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) { - void *addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; + void *addr, *tmp_addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; @@ -1032,7 +1032,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb, /* Don't destroy existing mappings unless expected to overlap */ while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { /* Check for unsigned overflow */ - if (addr + c.dest_alignment < addr) { + tmp_addr = addr + c.dest_alignment; + if (tmp_addr < addr) { ksft_print_msg("Couldn't find a valid region to remap to\n"); ret = -1; goto clean_up_src; -- cgit v1.2.3 From 340b59816bc417c306cd76b867914cfb4f386d2d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Nov 2025 16:57:09 +0800 Subject: mm: kill mm_wr_locked from unmap_vmas() and unmap_single_vma() Kill mm_wr_locked since commit f8e97613fed2 ("mm: convert VM_PFNMAP tracking to pfnmap_track() + pfnmap_untrack()") remove the user. Link: https://lkml.kernel.org/r/20251104085709.2688433-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index d873667704e8..c68d382dac81 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -848,8 +848,7 @@ static inline void update_hiwater_vm(struct mm_struct *mm) static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { } -- cgit v1.2.3 From 5dba5cc2e0ffa76f2f6c8922a04469dc9602c396 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:43 +0000 Subject: mm: introduce VM_MAYBE_GUARD and make visible in /proc/$pid/smaps Patch series "introduce VM_MAYBE_GUARD and make it sticky", v4. Currently, guard regions are not visible to users except through /proc/$pid/pagemap, with no explicit visibility at the VMA level. This makes the feature less useful, as it isn't entirely apparent which VMAs may have these entries present, especially when performing actions which walk through memory regions such as those performed by CRIU. This series addresses this issue by introducing the VM_MAYBE_GUARD flag which fulfils this role, updating the smaps logic to display an entry for these. The semantics of this flag are that a guard region MAY be present if set (we cannot be sure, as we can't efficiently track whether an MADV_GUARD_REMOVE finally removes all the guard regions in a VMA) - but if not set the VMA definitely does NOT have any guard regions present. It's problematic to establish this flag without further action, because that means that VMAs with guard regions in them become non-mergeable with adjacent VMAs for no especially good reason. To work around this, this series also introduces the concept of 'sticky' VMA flags - that is flags which: a. if set in one VMA and not in another still permit those VMAs to be merged (if otherwise compatible). b. When they are merged, the resultant VMA must have the flag set. The VMA logic is updated to propagate these flags correctly. Additionally, VM_MAYBE_GUARD being an explicit VMA flag allows us to solve an issue with file-backed guard regions - previously these established an anon_vma object for file-backed mappings solely to have vma_needs_copy() correctly propagate guard region mappings to child processes. We introduce a new flag alias VM_COPY_ON_FORK (which currently only specifies VM_MAYBE_GUARD) and update vma_needs_copy() to check explicitly for this flag and to copy page tables if it is present, which resolves this issue. Additionally, we add the ability for allow-listed VMA flags to be atomically writable with only mmap/VMA read locks held. The only flag we allow so far is VM_MAYBE_GUARD, which we carefully ensure does not cause any races by being allowed to do so. This allows us to maintain guard region installation as a read-locked operation and not endure the overhead of obtaining a write lock here. Finally we introduce extensive VMA userland tests to assert that the sticky VMA logic behaves correctly as well as guard region self tests to assert that smaps visibility is correctly implemented. This patch (of 9): Currently, if a user needs to determine if guard regions are present in a range, they have to scan all VMAs (or have knowledge of which ones might have guard regions). Since commit 8e2f2aeb8b48 ("fs/proc/task_mmu: add guard region bit to pagemap") and the related commit a516403787e0 ("fs/proc: extend the PAGEMAP_SCAN ioctl to report guard regions"), users can use either /proc/$pid/pagemap or the PAGEMAP_SCAN functionality to perform this operation at a virtual address level. This is not ideal, and it gives no visibility at a /proc/$pid/smaps level that guard regions exist in ranges. This patch remedies the situation by establishing a new VMA flag, VM_MAYBE_GUARD, to indicate that a VMA may contain guard regions (it is uncertain because we cannot reasonably determine whether a MADV_GUARD_REMOVE call has removed all of the guard regions in a VMA, and additionally VMAs may change across merge/split). We utilise 0x800 for this flag which makes it available to 32-bit architectures also, a flag that was previously used by VM_DENYWRITE, which was removed in commit 8d0920bde5eb ("mm: remove VM_DENYWRITE") and hasn't bee reused yet. We also update the smaps logic and documentation to identify these VMAs. Another major use of this functionality is that we can use it to identify that we ought to copy page tables on fork. We do not actually implement usage of this flag in mm/madvise.c yet as we need to allow some VMA flags to be applied atomically under mmap/VMA read lock in order to avoid the need to acquire a write lock for this purpose. Link: https://lkml.kernel.org/r/cover.1763460113.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/cf8ef821eba29b6c5b5e138fffe95d6dcabdedb9.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index c68d382dac81..46acb4df45de 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -56,6 +56,7 @@ extern unsigned long dac_mmap_min_addr; #define VM_MAYEXEC 0x00000040 #define VM_GROWSDOWN 0x00000100 #define VM_PFNMAP 0x00000400 +#define VM_MAYBE_GUARD 0x00000800 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -- cgit v1.2.3 From 9119d6c2095bb20292cb9812dd70d37f17e3bd37 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:45 +0000 Subject: mm: update vma_modify_flags() to handle residual flags, document The vma_modify_*() family of functions each either perform splits, a merge or no changes at all in preparation for the requested modification to occur. When doing so for a VMA flags change, we currently don't account for any flags which may remain (for instance, VM_SOFTDIRTY) despite the requested change in the case that a merge succeeded. This is made more important by subsequent patches which will introduce the concept of sticky VMA flags which rely on this behaviour. This patch fixes this by passing the VMA flags parameter as a pointer and updating it accordingly on merge and updating callers to accommodate for this. Additionally, while we are here, we add kdocs for each of the vma_modify_*() functions, as the fact that the requested modification is not performed is confusing so it is useful to make this abundantly clear. We also update the VMA userland tests to account for this change. Link: https://lkml.kernel.org/r/23b5b549b0eaefb2922625626e58c2a352f3e93c.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 656e1c75b711..fd37ce3b2628 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -339,6 +339,7 @@ static bool test_simple_modify(void) struct mm_struct mm = {}; struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags); VMA_ITERATOR(vmi, &mm, 0x1000); + vm_flags_t flags = VM_READ | VM_MAYREAD; ASSERT_FALSE(attach_vma(&mm, init_vma)); @@ -347,7 +348,7 @@ static bool test_simple_modify(void) * performs the merge/split only. */ vma = vma_modify_flags(&vmi, init_vma, init_vma, - 0x1000, 0x2000, VM_READ | VM_MAYREAD); + 0x1000, 0x2000, &flags); ASSERT_NE(vma, NULL); /* We modify the provided VMA, and on split allocate new VMAs. */ ASSERT_EQ(vma, init_vma); -- cgit v1.2.3 From 64212ba02e66e705cabce188453ba4e61e9d7325 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:46 +0000 Subject: mm: implement sticky VMA flags It is useful to be able to designate that certain flags are 'sticky', that is, if two VMAs are merged one with a flag of this nature and one without, the merged VMA sets this flag. As a result we ignore these flags for the purposes of determining VMA flag differences between VMAs being considered for merge. This patch therefore updates the VMA merge logic to perform this action, with flags possessing this property being described in the VM_STICKY bitmap. Those flags which ought to be ignored for the purposes of VMA merge are described in the VM_IGNORE_MERGE bitmap, which the VMA merge logic is also updated to use. As part of this change we place VM_SOFTDIRTY in VM_IGNORE_MERGE as it already had this behaviour, alongside VM_STICKY as sticky flags by implication must not disallow merge. Ultimately it seems that we should make VM_SOFTDIRTY a sticky flag in its own right, but this change is out of scope for this series. The only sticky flag designated as such is VM_MAYBE_GUARD, so as a result of this change, once the VMA flag is set upon guard region installation, VMAs with guard ranges will now not have their merge behaviour impacted as a result and can be freely merged with other VMAs without VM_MAYBE_GUARD set. Also update the comments for vma_modify_flags() to directly reference sticky flags now we have established the concept. We also update the VMA userland tests to account for the changes. Link: https://lkml.kernel.org/r/22ad5269f7669d62afb42ce0c79bad70b994c58d.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 46acb4df45de..73c2025777e6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -117,6 +117,34 @@ extern unsigned long dac_mmap_min_addr; #define VM_SEALED VM_NONE #endif +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that + * mapped page tables may contain metadata not described by the + * VMA and thus any merged VMA may also contain this metadata, + * and thus we must make this flag sticky. + */ +#define VM_STICKY VM_MAYBE_GUARD + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but + * dirty bit -- the caller should mark merged VMA as dirty. If + * dirty bit won't be excluded from comparison, we increase + * pressure on the memory system forcing the kernel to generate + * new VMAs when old one could be extended instead. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. + */ +#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) + #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL -- cgit v1.2.3 From ab04b530e7e8bd5cf9fb0c1ad20e0deee8f569ec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:47 +0000 Subject: mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one Gather all the VMA flags whose presence implies that page tables must be copied on fork into a single bitmap - VM_COPY_ON_FORK - and use this rather than specifying individual flags in vma_needs_copy(). We also add VM_MAYBE_GUARD to this list, as it being set on a VMA implies that there may be metadata contained in the page tables (that is - guard markers) which would will not and cannot be propagated upon fork. This was already being done manually previously in vma_needs_copy(), but this makes it very explicit, alongside VM_PFNMAP, VM_MIXEDMAP and VM_UFFD_WP all of which imply the same. Note that VM_STICKY flags ought generally to be marked VM_COPY_ON_FORK too - because equally a flag being VM_STICKY indicates that the VMA contains metadat that is not propagated by being faulted in - i.e. that the VMA metadata does not fully describe the VMA alone, and thus we must propagate whatever metadata there is on a fork. However, for maximum flexibility, we do not make this necessarily the case here. Link: https://lkml.kernel.org/r/5d41b24e7bc622cda0af92b6d558d7f4c0d1bc8c.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 73c2025777e6..233819a9e7ee 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -145,6 +145,32 @@ extern unsigned long dac_mmap_min_addr; */ #define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. + */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL -- cgit v1.2.3 From 29bef05e6d90b6123275159b52a1e520722243cb Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:49 +0000 Subject: tools/testing/vma: add VMA sticky userland tests Modify existing merge new/existing userland VMA tests to assert that sticky VMA flags behave as expected. We do so by generating every possible permutation of VMAs being manipulated being sticky/not sticky and asserting that VMA flags with this property retain are retained upon merge. Link: https://lkml.kernel.org/r/5e2c7244485867befd052f8afc8188be6a4be670.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pedro Falcato Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma.c | 89 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 10 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index fd37ce3b2628..be79ab2ea44b 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -48,6 +48,8 @@ static struct anon_vma dummy_anon_vma; #define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) #define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) +#define IS_SET(_val, _flags) ((_val & _flags) == _flags) + static struct task_struct __current; struct task_struct *get_current(void) @@ -442,7 +444,7 @@ static bool test_simple_shrink(void) return true; } -static bool test_merge_new(void) +static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, bool c_is_sticky) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; @@ -470,23 +472,32 @@ static bool test_merge_new(void) struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; bool merged; + if (is_sticky) + vm_flags |= VM_STICKY; + /* * 0123456789abc * AA B CC */ vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); ASSERT_NE(vma_a, NULL); + if (a_is_sticky) + vm_flags_set(vma_a, VM_STICKY); /* We give each VMA a single avc so we can test anon_vma duplication. */ INIT_LIST_HEAD(&vma_a->anon_vma_chain); list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); ASSERT_NE(vma_b, NULL); + if (b_is_sticky) + vm_flags_set(vma_b, VM_STICKY); INIT_LIST_HEAD(&vma_b->anon_vma_chain); list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags); ASSERT_NE(vma_c, NULL); + if (c_is_sticky) + vm_flags_set(vma_c, VM_STICKY); INIT_LIST_HEAD(&vma_c->anon_vma_chain); list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); @@ -521,6 +532,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky || a_is_sticky || b_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to PREVIOUS VMA. @@ -538,6 +551,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky || a_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to NEXT VMA. @@ -557,6 +572,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky) /* D uses is_sticky. */ + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge BOTH sides. @@ -575,6 +592,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (is_sticky || a_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to NEXT VMA. @@ -593,6 +612,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (is_sticky || c_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge BOTH sides. @@ -610,6 +631,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 1); + if (is_sticky || a_is_sticky || c_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Final state. @@ -638,6 +661,20 @@ static bool test_merge_new(void) return true; } +static bool test_merge_new(void) +{ + int i, j, k, l; + + /* Generate every possible permutation of sticky flags. */ + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + for (l = 0; l < 2; l++) + ASSERT_TRUE(__test_merge_new(i, j, k, l)); + + return true; +} + static bool test_vma_merge_special_flags(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; @@ -974,9 +1011,11 @@ static bool test_vma_merge_new_with_close(void) return true; } -static bool test_merge_existing(void) +static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bool next_is_sticky) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t prev_flags = vm_flags; + vm_flags_t next_flags = vm_flags; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma, *vma_prev, *vma_next; @@ -989,6 +1028,13 @@ static bool test_merge_existing(void) }; struct anon_vma_chain avc = {}; + if (prev_is_sticky) + prev_flags |= VM_STICKY; + if (middle_is_sticky) + vm_flags |= VM_STICKY; + if (next_is_sticky) + next_flags |= VM_STICKY; + /* * Merge right case - partial span. * @@ -1001,7 +1047,7 @@ static bool test_merge_existing(void) */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma); vmg.middle = vma; @@ -1019,6 +1065,8 @@ static bool test_merge_existing(void) ASSERT_TRUE(vma_write_started(vma)); ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 2); + if (middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1034,7 +1082,7 @@ static bool test_merge_existing(void) * NNNNNNN */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma); vmg.middle = vma; @@ -1047,6 +1095,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 1); + if (middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1061,7 +1111,7 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPV */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ @@ -1081,6 +1131,8 @@ static bool test_merge_existing(void) ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (prev_is_sticky || middle_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1095,7 +1147,7 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); @@ -1110,6 +1162,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); + if (prev_is_sticky || middle_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1124,10 +1178,10 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, next_flags); vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; @@ -1140,6 +1194,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); + if (prev_is_sticky || middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted prev and next. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1159,9 +1215,9 @@ static bool test_merge_existing(void) * PPPVVVVVNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, next_flags); vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags); vmg.prev = vma; @@ -1204,6 +1260,19 @@ static bool test_merge_existing(void) return true; } +static bool test_merge_existing(void) +{ + int i, j, k; + + /* Generate every possible permutation of sticky flags. */ + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + ASSERT_TRUE(__test_merge_existing(i, j, k)); + + return true; +} + static bool test_anon_vma_non_mergeable(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; -- cgit v1.2.3 From 89330ec89741d12970b513af94fd2c46997ae1db Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:50 +0000 Subject: tools/testing/selftests/mm: add MADV_COLLAPSE test case To ensure the retract_page_tables() logic functions correctly with the introduction of VM_MAYBE_GUARD, add a test to assert that madvise collapse fails when guard regions are established in the collapsed range in all cases. Unfortunately we cannot differentiate between e.g. CONFIG_READ_ONLY_THP_FOR_FS not being set vs. a file-backed VMA having collapse correctly disallowed, so in each instance we will get an assert pass here. We add an additional check to see whether guard regions are preserved across collapse in case of a bug causing the collapse to succeed, which will give us more data to debug with should this occur in future. Link: https://lkml.kernel.org/r/0748beeb864525b8ddfa51adad7128dd32eb3ac4.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pedro Falcato Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/guard-regions.c | 65 ++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index 8dd81c0a4a5a..c549bcd6160b 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -2138,4 +2138,69 @@ TEST_F(guard_regions, pagemap_scan) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +TEST_F(guard_regions, collapse) +{ + const unsigned long page_size = self->page_size; + const unsigned long size = 2 * HPAGE_SIZE; + const unsigned long num_pages = size / page_size; + char *ptr; + int i; + + /* Need file to be correct size for tests for non-anon. */ + if (variant->backing != ANON_BACKED) + ASSERT_EQ(ftruncate(self->fd, size), 0); + + /* + * We must close and re-open local-file backed as read-only for + * CONFIG_READ_ONLY_THP_FOR_FS to work. + */ + if (variant->backing == LOCAL_FILE_BACKED) { + ASSERT_EQ(close(self->fd), 0); + + self->fd = open(self->path, O_RDONLY); + ASSERT_GE(self->fd, 0); + } + + ptr = mmap_(self, variant, NULL, size, PROT_READ, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Prevent being faulted-in as huge. */ + ASSERT_EQ(madvise(ptr, size, MADV_NOHUGEPAGE), 0); + /* Fault in. */ + ASSERT_EQ(madvise(ptr, size, MADV_POPULATE_READ), 0); + + /* Install guard regions in ever other page. */ + for (i = 0; i < num_pages; i += 2) { + char *ptr_page = &ptr[i * page_size]; + + ASSERT_EQ(madvise(ptr_page, page_size, MADV_GUARD_INSTALL), 0); + /* Accesses should now fail. */ + ASSERT_FALSE(try_read_buf(ptr_page)); + } + + /* Allow huge page throughout region. */ + ASSERT_EQ(madvise(ptr, size, MADV_HUGEPAGE), 0); + + /* + * Now collapse the entire region. This should fail in all cases. + * + * The madvise() call will also fail if CONFIG_READ_ONLY_THP_FOR_FS is + * not set for the local file case, but we can't differentiate whether + * this occurred or if the collapse was rightly rejected. + */ + EXPECT_NE(madvise(ptr, size, MADV_COLLAPSE), 0); + + /* + * If we introduce a bug that causes the collapse to succeed, gather + * data on whether guard regions are at least preserved. The test will + * fail at this point in any case. + */ + for (i = 0; i < num_pages; i += 2) { + char *ptr_page = &ptr[i * page_size]; + + /* Accesses should still fail. */ + ASSERT_FALSE(try_read_buf(ptr_page)); + } +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From c0ae966fac00bfd31490b6a9bf494d28865ff840 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:51 +0000 Subject: tools/testing/selftests/mm: add smaps visibility guard region test Assert that we observe guard regions appearing in /proc/$pid/smaps as expected, and when split/merge is performed too (with expected sticky behaviour). Also add handling for file systems which don't sanely handle mmap() VMA merging so we don't incorrectly encounter a test failure in this situation. Link: https://lkml.kernel.org/r/059e62b8c67e55e6d849878206a95ea1d7c1e885.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Pedro Falcato Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/guard-regions.c | 120 +++++++++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 5 ++ tools/testing/selftests/mm/vm_util.h | 1 + 3 files changed, 126 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index c549bcd6160b..795bf3f39f44 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -94,6 +94,7 @@ static void *mmap_(FIXTURE_DATA(guard_regions) * self, case ANON_BACKED: flags |= MAP_PRIVATE | MAP_ANON; fd = -1; + offset = 0; break; case SHMEM_BACKED: case LOCAL_FILE_BACKED: @@ -260,6 +261,54 @@ static bool is_buf_eq(char *buf, size_t size, char chr) return true; } +/* + * Some file systems have issues with merging due to changing merge-sensitive + * parameters in the .mmap callback, and prior to .mmap_prepare being + * implemented everywhere this will now result in an unexpected failure to + * merge (e.g. - overlayfs). + * + * Perform a simple test to see if the local file system suffers from this, if + * it does then we can skip test logic that assumes local file system merging is + * sane. + */ +static bool local_fs_has_sane_mmap(FIXTURE_DATA(guard_regions) * self, + const FIXTURE_VARIANT(guard_regions) * variant) +{ + const unsigned long page_size = self->page_size; + char *ptr, *ptr2; + struct procmap_fd procmap; + + if (variant->backing != LOCAL_FILE_BACKED) + return true; + + /* Map 10 pages. */ + ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0); + if (ptr == MAP_FAILED) + return false; + /* Unmap the middle. */ + munmap(&ptr[5 * page_size], page_size); + + /* Map again. */ + ptr2 = mmap_(self, variant, &ptr[5 * page_size], page_size, PROT_READ | PROT_WRITE, + MAP_FIXED, 5 * page_size); + + if (ptr2 == MAP_FAILED) + return false; + + /* Now make sure they all merged. */ + if (open_self_procmap(&procmap) != 0) + return false; + if (!find_vma_procmap(&procmap, ptr)) + return false; + if (procmap.query.vma_start != (unsigned long)ptr) + return false; + if (procmap.query.vma_end != (unsigned long)ptr + 10 * page_size) + return false; + close_procmap(&procmap); + + return true; +} + FIXTURE_SETUP(guard_regions) { self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); @@ -2203,4 +2252,75 @@ TEST_F(guard_regions, collapse) } } +TEST_F(guard_regions, smaps) +{ + const unsigned long page_size = self->page_size; + struct procmap_fd procmap; + char *ptr, *ptr2; + int i; + + /* Map a region. */ + ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* We shouldn't yet see a guard flag. */ + ASSERT_FALSE(check_vmflag_guard(ptr)); + + /* Install a single guard region. */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + + /* Now we should see a guard flag. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + + /* + * Removing the guard region should not change things because we simply + * cannot accurately track whether a given VMA has had all of its guard + * regions removed. + */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_REMOVE), 0); + ASSERT_TRUE(check_vmflag_guard(ptr)); + + /* Install guard regions throughout. */ + for (i = 0; i < 10; i++) { + ASSERT_EQ(madvise(&ptr[i * page_size], page_size, MADV_GUARD_INSTALL), 0); + /* We should always see the guard region flag. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + } + + /* Split into two VMAs. */ + ASSERT_EQ(munmap(&ptr[4 * page_size], page_size), 0); + + /* Both VMAs should have the guard flag set. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + ASSERT_TRUE(check_vmflag_guard(&ptr[5 * page_size])); + + /* + * If the local file system is unable to merge VMAs due to having + * unusual characteristics, there is no point in asserting merge + * behaviour. + */ + if (!local_fs_has_sane_mmap(self, variant)) { + TH_LOG("local filesystem does not support sane merging skipping merge test"); + return; + } + + /* Map a fresh VMA between the two split VMAs. */ + ptr2 = mmap_(self, variant, &ptr[4 * page_size], page_size, + PROT_READ | PROT_WRITE, MAP_FIXED, 4 * page_size); + ASSERT_NE(ptr2, MAP_FAILED); + + /* + * Check the procmap to ensure that this VMA merged with the adjacent + * two. The guard region flag is 'sticky' so should not preclude + * merging. + */ + ASSERT_EQ(open_self_procmap(&procmap), 0); + ASSERT_TRUE(find_vma_procmap(&procmap, ptr)); + ASSERT_EQ(procmap.query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap.query.vma_end, (unsigned long)ptr + 10 * page_size); + ASSERT_EQ(close_procmap(&procmap), 0); + /* And, of course, this VMA should have the guard flag set. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index e33cda301dad..605cb58ea5c3 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -449,6 +449,11 @@ bool check_vmflag_pfnmap(void *addr) return check_vmflag(addr, "pf"); } +bool check_vmflag_guard(void *addr) +{ + return check_vmflag(addr, "gu"); +} + bool softdirty_supported(void) { char *addr; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 26c30fdc0241..a8abdf414d46 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -98,6 +98,7 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, unsigned long get_free_hugepages(void); bool check_vmflag_io(void *addr); bool check_vmflag_pfnmap(void *addr); +bool check_vmflag_guard(void *addr); int open_procmap(pid_t pid, struct procmap_fd *procmap_out); int query_procmap(struct procmap_fd *procmap); bool find_vma_procmap(struct procmap_fd *procmap, void *address); -- cgit v1.2.3 From 2197bb60f89077603cc580ff752c5cf6388c1099 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Nov 2025 20:32:01 +0000 Subject: mm: add vma_start_write_killable() Patch series "vma_start_write_killable"", v2. When we added the VMA lock, we made a major oversight in not adding a killable variant. That can run us into trouble where a thread takes the VMA lock for read (eg handling a page fault) and then goes out to lunch for an hour (eg doing reclaim). Another thread tries to modify the VMA, taking the mmap_lock for write, then attempts to lock the VMA for write. That blocks on the first thread, and ensures that every other page fault now tries to take the mmap_lock for read. Because everything's in an uninterruptible sleep, we can't kill the task, which makes me angry. This patchset just adds vma_start_write_killable() and converts one caller to use it. Most users are somewhat tricky to convert, so expect follow-up individual patches per call-site which need careful analysis to make sure we've done proper cleanup. This patch (of 2): The vma can be held read-locked for a substantial period of time, eg if memory allocation needs to go into reclaim. It's useful to be able to send fatal signals to threads which are waiting for the write lock. Link: https://lkml.kernel.org/r/20251110203204.1454057-1-willy@infradead.org Link: https://lkml.kernel.org/r/20251110203204.1454057-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Chris Li Cc: Jann Horn Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 233819a9e7ee..73a899ba2686 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -952,6 +952,14 @@ static inline void vma_start_write(struct vm_area_struct *vma) vma->vm_lock_seq++; } +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; + return 0; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From 1ec5d5810b6f17d5c247f1ffcaed0ed3e8e39609 Mon Sep 17 00:00:00 2001 From: Mehdi Ben Hadj Khelifa Date: Tue, 11 Nov 2025 21:54:27 +0100 Subject: selftests/mm/uffd: remove static address usage in shmem_allocate_area() The current shmem_allocate_area() implementation uses a hardcoded virtual base address (BASE_PMD_ADDR) as a hint for mmap() when creating shmem-backed test areas. This approach is fragile and may fail on systems with ASLR or different virtual memory layouts, where the chosen address is unavailable. Replace the static base address with a dynamically reserved address range obtained via mmap(NULL, ..., PROT_NONE). The memfd-backed areas and their alias are then mapped into that reserved region using MAP_FIXED, preserving the original layout and aliasing semantics while avoiding collisions with unrelated mappings. This change improves robustness and portability of the test suite without altering its behavior or coverage. [mehdi.benhadjkhelifa@gmail.com: make cleanup code more clear, per Mike] Link: https://lkml.kernel.org/r/20251113142050.108638-1-mehdi.benhadjkhelifa@gmail.com Link: https://lkml.kernel.org/r/20251111205739.420009-1-mehdi.benhadjkhelifa@gmail.com Signed-off-by: Mehdi Ben Hadj Khelifa Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: David Hunter Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 994fe8c03923..edd02328f77b 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -10,7 +10,6 @@ uffd_test_ops_t *uffd_test_ops; uffd_test_case_ops_t *uffd_test_case_ops; -#define BASE_PMD_ADDR ((void *)(1UL << 30)) /* pthread_mutex_t starts at page offset 0 */ pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts) @@ -142,30 +141,37 @@ static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area unsigned long offset = is_src ? 0 : bytes; char *p = NULL, *p_alias = NULL; int mem_fd = uffd_mem_fd_create(bytes * 2, false); + size_t region_size = bytes * 2 + hpage_size; - /* TODO: clean this up. Use a static addr is ugly */ - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); + void *reserve = mmap(NULL, region_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if (reserve == MAP_FAILED) { + close(mem_fd); + return -errno; + } + + p = reserve; p_alias = p; p_alias += bytes; p_alias += hpage_size; /* Prevent src/dst VMA merge */ - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, mem_fd, offset); if (*alloc_area == MAP_FAILED) { *alloc_area = NULL; + munmap(reserve, region_size); + close(mem_fd); return -errno; } if (*alloc_area != p) err("mmap of memfd failed at %p", p); - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, mem_fd, offset); if (area_alias == MAP_FAILED) { - munmap(*alloc_area, bytes); *alloc_area = NULL; + munmap(reserve, region_size); + close(mem_fd); return -errno; } if (area_alias != p_alias) -- cgit v1.2.3 From 53298afe456e62ad2c2dc8bc7aa54bb86a67ba2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:05 -0800 Subject: mm/damon: rename damos->filters to damos->core_filters DAMOS filters that are handled by the ops layer are linked to damos->ops_filters. Owing to the ops_ prefix on the name, it is easy to understand it is for ops layer handled filters. The other types of filters, which are handled by the core layer, are linked to damos->filters. Because of the name, it is easy to confuse the list is there for not only core layer handled ones but all filters. Avoid such confusions by renaming the field to core_filters. Link: https://lkml.kernel.org/r/20251112154114.66053-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/drgn_dump_damon_status.py | 8 ++++---- tools/testing/selftests/damon/sysfs.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index cb4fdbe68acb..5374d18d1fa8 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -175,11 +175,11 @@ def scheme_to_dict(scheme): ['target_nid', int], ['migrate_dests', damos_migrate_dests_to_dict], ]) - filters = [] + core_filters = [] for f in list_for_each_entry( - 'struct damos_filter', scheme.filters.address_of_(), 'list'): - filters.append(damos_filter_to_dict(f)) - dict_['filters'] = filters + 'struct damos_filter', scheme.core_filters.address_of_(), 'list'): + core_filters.append(damos_filter_to_dict(f)) + dict_['core_filters'] = core_filters ops_filters = [] for f in list_for_each_entry( 'struct damos_filter', scheme.ops_filters.address_of_(), 'list'): diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index b34aea0a6775..b4c5ef5c4d69 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -132,7 +132,7 @@ def assert_scheme_committed(scheme, dump): assert_watermarks_committed(scheme.watermarks, dump['wmarks']) # TODO: test filters directory for idx, f in enumerate(scheme.core_filters.filters): - assert_filter_committed(f, dump['filters'][idx]) + assert_filter_committed(f, dump['core_filters'][idx]) for idx, f in enumerate(scheme.ops_filters.filters): assert_filter_committed(f, dump['ops_filters'][idx]) -- cgit v1.2.3 From 675774adbe800b350714ce46e184f48fa101512d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:10 -0800 Subject: selftests/damon/sysfs.py: merge DAMON status dumping into commitment assertion For each test case, sysfs.py makes changes to DAMON, dumps DAMON internal status and asserts the expectation is met. The dumping part should be the same for all cases, so it is duplicated for each test case. Which means it is easy to make mistakes. Actually a few of those duplicates are not turning DAMON off in case of the dumping failure. It makes following selftests that need to turn DAMON on fails with -EBUSY. Merge the status dumping into commitment assertion with proper dumping failure handling, to deduplicate and avoid the unnecessary following tests failures. Link: https://lkml.kernel.org/r/20251112154114.66053-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 43 ++++++++++------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index b4c5ef5c4d69..9cca71eb0325 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -185,7 +185,15 @@ def assert_ctx_committed(ctx, dump): assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets']) assert_schemes_committed(ctx.schemes, dump['schemes']) -def assert_ctxs_committed(ctxs, dump): +def assert_ctxs_committed(kdamonds): + status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) + if err is not None: + print(err) + kdamonds.stop() + exit(1) + + ctxs = kdamonds.kdamonds[0].contexts + dump = status['contexts'] assert_true(len(ctxs) == len(dump), 'ctxs length', dump) for idx, ctx in enumerate(ctxs): assert_ctx_committed(ctx, dump[idx]) @@ -202,13 +210,7 @@ def main(): print('kdamond start failed: %s' % err) exit(1) - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - kdamonds.stop() - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) context = _damon_sysfs.DamonCtx( monitoring_attrs=_damon_sysfs.DamonAttrs( @@ -256,12 +258,7 @@ def main(): kdamonds.kdamonds[0].contexts = [context] kdamonds.kdamonds[0].commit() - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) # test online commitment of minimum context. context = _damon_sysfs.DamonCtx() @@ -270,12 +267,7 @@ def main(): kdamonds.kdamonds[0].contexts = [context] kdamonds.kdamonds[0].commit() - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) kdamonds.stop() @@ -303,17 +295,8 @@ def main(): exit(1) kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True kdamonds.kdamonds[0].commit() - - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - kdamonds.stop() - exit(1) - del kdamonds.kdamonds[0].contexts[0].targets[1] - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) - + assert_ctxs_committed(kdamonds) kdamonds.stop() if __name__ == '__main__': -- cgit v1.2.3 From 6707915e030a3258868355f989b80140c1a45bbe Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 17 Nov 2025 17:33:38 +0000 Subject: mm: propagate VM_SOFTDIRTY on merge Patch series "make VM_SOFTDIRTY a sticky VMA flag", v2. Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by establishing a new VMA, or via merge) as implemented in __mmap_complete() and do_brk_flags(). However, when performing a merge of existing mappings such as when performing mprotect(), we may lose the VM_SOFTDIRTY flag. Now we have the concept of making VMA flags 'sticky', that is that they both don't prevent merge and, importantly, are propagated to merged VMAs, this seems a sensible alternative to the existing special-casing of VM_SOFTDIRTY. We additionally add a self-test that demonstrates that this logic behaves as expected. This patch (of 2): Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by establishing a new VMA, or via merge) as implemented in __mmap_complete() and do_brk_flags(). However, when performing a merge of existing mappings such as when performing mprotect(), we may lose the VM_SOFTDIRTY flag. This is because currently we simply ignore VM_SOFTDIRTY for the purposes of merge, so one VMA may possess the flag and another not, and whichever happens to be the target VMA will be the one upon which the merge is performed which may or may not have VM_SOFTDIRTY set. Now we have the concept of 'sticky' VMA flags, let's make VM_SOFTDIRTY one which solves this issue. Additionally update VMA userland tests to propagate changes. [akpm@linux-foundation.org: update comments, per Lorenzo] Link: https://lkml.kernel.org/r/0019e0b8-ee1e-4359-b5ee-94225cbe5588@lucifer.local Link: https://lkml.kernel.org/r/cover.1763399675.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/955478b5170715c895d1ef3b7f68e0cd77f76868.1763399675.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Pedro Falcato Acked-by: Andrey Vagin Reviewed-by: Vlastimil Babka Acked-by: Cyrill Gorcunov Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 73a899ba2686..81b501f51948 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -122,28 +122,22 @@ extern unsigned long dac_mmap_min_addr; * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * - * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that - * mapped page tables may contain metadata not described by the - * VMA and thus any merged VMA may also contain this metadata, - * and thus we must make this flag sticky. + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. */ -#define VM_STICKY VM_MAYBE_GUARD +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but - * dirty bit -- the caller should mark merged VMA as dirty. If - * dirty bit won't be excluded from comparison, we increase - * pressure on the memory system forcing the kernel to generate - * new VMAs when old one could be extended instead. - * * VM_STICKY - When merging VMAs, VMA flags must match, unless they are * 'sticky'. If any sticky flags exist in either VMA, we simply * set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +#define VM_IGNORE_MERGE VM_STICKY /* * Flags which should result in page tables being copied on fork. These are -- cgit v1.2.3 From c7ba92bcfea34f6b4afc744c3b65c8f7420fefe0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 17 Nov 2025 17:33:39 +0000 Subject: testing/selftests/mm: add soft-dirty merge self-test Assert that we correctly merge VMAs containing VM_SOFTDIRTY flags now that we correctly handle these as sticky. In order to do so, we have to account for the fact the pagemap interface checks soft dirty PTEs and additionally that newly merged VMAs are marked VM_SOFTDIRTY. We do this by using use unfaulted anon VMAs, establishing one and clearing references on that one, before establishing another and merging the two before checking that soft-dirty is propagated as expected. We check that this functions correctly with mremap() and mprotect() as sample cases, because VMA merge of adjacent newly mapped VMAs will automatically be made soft-dirty due to existing logic which does so. We are therefore exercising other means of merging VMAs. Link: https://lkml.kernel.org/r/d5a0f735783fb4f30a604f570ede02ccc5e29be9.1763399675.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Andrey Vagin Cc: David Hildenbrand (Red Hat) Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/soft-dirty.c | 127 +++++++++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 4ee4db3750c1..c3a9585de98c 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -184,6 +184,130 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon) close(test_fd); } +static void test_merge(int pagemap_fd, int pagesize) +{ + char *reserved, *map, *map2; + + /* + * Reserve space for tests: + * + * ---padding to --- + * | avoid adj. | + * v merge v + * |---|---|---|---|---| + * | | 1 | 2 | 3 | | + * |---|---|---|---|---| + */ + reserved = mmap(NULL, 5 * pagesize, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (reserved == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + munmap(reserved, 4 * pagesize); + + /* + * Establish initial VMA: + * + * S/D + * |---|---|---|---|---| + * | | 1 | | | | + * |---|---|---|---|---| + */ + map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* This will clear VM_SOFTDIRTY too. */ + clear_softdirty(); + + /* + * Now place a new mapping which will be marked VM_SOFTDIRTY. Away from + * map: + * + * - S/D + * |---|---|---|---|---| + * | | 1 | | 2 | | + * |---|---|---|---|---| + */ + map2 = mmap(&reserved[3 * pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* + * Now remap it immediately adjacent to map, if the merge correctly + * propagates VM_SOFTDIRTY, we should then observe the VMA as a whole + * being marked soft-dirty: + * + * merge + * S/D + * |---|-------|---|---| + * | | 1 | | | + * |---|-------|---|---| + */ + map2 = mremap(map2, pagesize, pagesize, MREMAP_FIXED | MREMAP_MAYMOVE, + &reserved[2 * pagesize]); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mremap failed\n"); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-anon soft-dirty after remap merge 1st pg\n", + __func__); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s-anon soft-dirty after remap merge 2nd pg\n", + __func__); + + munmap(map, 2 * pagesize); + + /* + * Now establish another VMA: + * + * S/D + * |---|---|---|---|---| + * | | 1 | | | | + * |---|---|---|---|---| + */ + map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* Clear VM_SOFTDIRTY... */ + clear_softdirty(); + /* ...and establish incompatible adjacent VMA: + * + * - S/D + * |---|---|---|---|---| + * | | 1 | 2 | | | + * |---|---|---|---|---| + */ + map2 = mmap(&reserved[2 * pagesize], pagesize, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* + * Now mprotect() VMA 1 so it's compatible with 2 and therefore merges: + * + * merge + * S/D + * |---|-------|---|---| + * | | 1 | | | + * |---|-------|---|---| + */ + if (mprotect(map, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC)) + ksft_exit_fail_msg("mprotect failed\n"); + + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-anon soft-dirty after mprotect merge 1st pg\n", + __func__); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s-anon soft-dirty after mprotect merge 2nd pg\n", + __func__); + + munmap(map, 2 * pagesize); +} + static void test_mprotect_anon(int pagemap_fd, int pagesize) { test_mprotect(pagemap_fd, pagesize, true); @@ -204,7 +328,7 @@ int main(int argc, char **argv) if (!softdirty_supported()) ksft_exit_skip("soft-dirty is not support\n"); - ksft_set_plan(15); + ksft_set_plan(19); pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); if (pagemap_fd < 0) ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); @@ -216,6 +340,7 @@ int main(int argc, char **argv) test_hugepage(pagemap_fd, pagesize); test_mprotect_anon(pagemap_fd, pagesize); test_mprotect_file(pagemap_fd, pagesize); + test_merge(pagemap_fd, pagesize); close(pagemap_fd); -- cgit v1.2.3 From 519071529d2ad8a041e2e9d75ead5f9e6fe60026 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:04 +1000 Subject: selftests/mm/hmm-tests: new tests for zone device THP migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new tests for migrating anon THP pages, including anon_huge, anon_huge_zero and error cases involving forced splitting of pages during migration. Link: https://lkml.kernel.org/r/20251001065707.920170-14-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 410 +++++++++++++++++++++++++++++++++ 1 file changed, 410 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 15aadaf24a66..339a90183930 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -2055,4 +2055,414 @@ TEST_F(hmm, hmm_cow_in_device) hmm_buffer_free(buffer); } + +/* + * Migrate private anonymous huge empty page. + */ +TEST_F(hmm, migrate_anon_huge_empty) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge zero page. + */ +TEST_F(hmm, migrate_anon_huge_zero) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + int val; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize a read-only zero huge page. */ + val = *(int *)buffer->ptr; + ASSERT_EQ(val, 0); + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) { + ASSERT_EQ(ptr[i], 0); + /* If it asserts once, it probably will 500,000 times */ + if (ptr[i] != 0) + break; + } + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page and free. + */ +TEST_F(hmm, migrate_anon_huge_free) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Try freeing it. */ + ret = madvise(map, size, MADV_FREE); + ASSERT_EQ(ret, 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page and fault back to sysmem. + */ +TEST_F(hmm, migrate_anon_huge_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page with allocation errors. + */ +TEST_F(hmm, migrate_anon_huge_err) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(2 * size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, 2 * size); + + old_ptr = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) { + ASSERT_EQ(ptr[i], i); + if (ptr[i] != i) + break; + } + + /* Try faulting back a single (PAGE_SIZE) page. */ + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 2048); + + /* unmap and remap the region to reset things. */ + ret = munmap(old_ptr, 2 * size); + ASSERT_EQ(ret, 0); + old_ptr = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate THP to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* + * Force an allocation error when faulting back a THP resident in the + * device. + */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 2048); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge zero page with allocation errors. + */ +TEST_F(hmm, migrate_anon_huge_zero_err) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(2 * size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, 2 * size); + + old_ptr = mmap(NULL, 2 * size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Try faulting back a single (PAGE_SIZE) page. */ + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 0); + + /* unmap and remap the region to reset things. */ + ret = munmap(old_ptr, 2 * size); + ASSERT_EQ(ret, 0); + old_ptr = mmap(NULL, 2 * size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory (zero THP page). */ + ret = ptr[0]; + ASSERT_EQ(ret, 0); + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Fault the device memory back and check it. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} TEST_HARNESS_MAIN -- cgit v1.2.3 From 24c2c5b8ffbd50d5dcd340c553c717948ca99aac Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Wed, 1 Oct 2025 16:57:05 +1000 Subject: selftests/mm/hmm-tests: partial unmap, mremap and anon_write tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add partial unmap test case which munmaps memory while in the device. Add tests exercising mremap on faulted-in memory (CPU and GPU) at various offsets and verify correctness. Update anon_write_child to read device memory after fork verifying this flow works in the kernel. Both THP and non-THP cases are updated. Link: https://lkml.kernel.org/r/20251001065707.920170-15-balbirs@nvidia.com Signed-off-by: Balbir Singh Signed-off-by: Matthew Brost Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 312 ++++++++++++++++++++++++++------- 1 file changed, 252 insertions(+), 60 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 339a90183930..dedc1049bd4d 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -50,6 +50,8 @@ enum { HMM_COHERENCE_DEVICE_TWO, }; +#define ONEKB (1 << 10) +#define ONEMEG (1 << 20) #define TWOMEG (1 << 21) #define HMM_BUFFER_SIZE (1024 << 12) #define HMM_PATH_MAX 64 @@ -525,6 +527,8 @@ TEST_F(hmm, anon_write_prot) /* * Check that a device writing an anonymous private mapping * will copy-on-write if a child process inherits the mapping. + * + * Also verifies after fork() memory the device can be read by child. */ TEST_F(hmm, anon_write_child) { @@ -532,72 +536,101 @@ TEST_F(hmm, anon_write_child) unsigned long npages; unsigned long size; unsigned long i; + void *old_ptr; + void *map; int *ptr; pid_t pid; int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; + int ret, use_thp, migrate; + + for (migrate = 0; migrate < 2; ++migrate) { + for (use_thp = 0; use_thp < 2; ++use_thp) { + npages = ALIGN(use_thp ? TWOMEG : HMM_BUFFER_SIZE, + self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size * 2; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size * 2, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + old_ptr = buffer->ptr; + if (use_thp) { + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + } + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + if (migrate) { + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + } + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + continue; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + if (!migrate) { + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + } + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; + /* Check what the device wrote. */ + if (!migrate) { + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + } - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); - - /* Check that the parent's buffer did not change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - return; + close(child_fd); + exit(0); + } } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); } /* @@ -2289,6 +2322,165 @@ TEST_F(hmm, migrate_anon_huge_fault) hmm_buffer_free(buffer); } +/* + * Migrate memory and fault back to sysmem after partially unmapping. + */ +TEST_F(hmm, migrate_partial_unmap_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size = TWOMEG; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret, j, use_thp; + int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + for (use_thp = 0; use_thp < 2; ++use_thp) { + for (j = 0; j < ARRAY_SIZE(offsets); ++j) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + if (use_thp) + ret = madvise(map, size, MADV_HUGEPAGE); + else + ret = madvise(map, size, MADV_NOHUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(buffer->ptr + offsets[j], ONEMEG); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + if (i * sizeof(int) < offsets[j] || + i * sizeof(int) >= offsets[j] + ONEMEG) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + } + } +} + +TEST_F(hmm, migrate_remap_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size = TWOMEG; + unsigned long i; + void *old_ptr, *new_ptr = NULL; + void *map; + int *ptr; + int ret, j, use_thp, dont_unmap, before; + int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + for (before = 0; before < 2; ++before) { + for (dont_unmap = 0; dont_unmap < 2; ++dont_unmap) { + for (use_thp = 0; use_thp < 2; ++use_thp) { + for (j = 0; j < ARRAY_SIZE(offsets); ++j) { + int flags = MREMAP_MAYMOVE | MREMAP_FIXED; + + if (dont_unmap) + flags |= MREMAP_DONTUNMAP; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 8 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, buffer->size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + if (use_thp) + ret = madvise(map, size, MADV_HUGEPAGE); + else + ret = madvise(map, size, MADV_NOHUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + munmap(map + size, size * 2); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; + i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + if (before) { + new_ptr = mremap((void *)map, size, size, flags, + map + size + offsets[j]); + ASSERT_NE(new_ptr, MAP_FAILED); + buffer->ptr = new_ptr; + } + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + if (!before) { + new_ptr = mremap((void *)map, size, size, flags, + map + size + offsets[j]); + ASSERT_NE(new_ptr, MAP_FAILED); + buffer->ptr = new_ptr; + } + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; + i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(new_ptr, size); + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + } + } + } + } +} + /* * Migrate private anonymous huge page with allocation errors. */ -- cgit v1.2.3 From 271a7b2e3c1370d36fde867bfee201bd74b53704 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:06 +1000 Subject: selftests/mm/hmm-tests: new throughput tests including THP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new benchmark style support to test transfer bandwidth for zone device memory operations. Link: https://lkml.kernel.org/r/20251001065707.920170-16-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 197 ++++++++++++++++++++++++++++++++- 1 file changed, 196 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index dedc1049bd4d..5a1525f72daa 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -25,6 +25,7 @@ #include #include #include +#include /* @@ -209,8 +210,10 @@ static void hmm_buffer_free(struct hmm_buffer *buffer) if (buffer == NULL) return; - if (buffer->ptr) + if (buffer->ptr) { munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + } free(buffer->mirror); free(buffer); } @@ -2657,4 +2660,196 @@ TEST_F(hmm, migrate_anon_huge_zero_err) buffer->ptr = old_ptr; hmm_buffer_free(buffer); } + +struct benchmark_results { + double sys_to_dev_time; + double dev_to_sys_time; + double throughput_s2d; + double throughput_d2s; +}; + +static double get_time_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000.0) + (tv.tv_usec / 1000.0); +} + +static inline struct hmm_buffer *hmm_buffer_alloc(unsigned long size) +{ + struct hmm_buffer *buffer; + + buffer = malloc(sizeof(*buffer)); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + memset(buffer->mirror, 0xFF, size); + return buffer; +} + +static void print_benchmark_results(const char *test_name, size_t buffer_size, + struct benchmark_results *thp, + struct benchmark_results *regular) +{ + double s2d_improvement = ((regular->sys_to_dev_time - thp->sys_to_dev_time) / + regular->sys_to_dev_time) * 100.0; + double d2s_improvement = ((regular->dev_to_sys_time - thp->dev_to_sys_time) / + regular->dev_to_sys_time) * 100.0; + double throughput_s2d_improvement = ((thp->throughput_s2d - regular->throughput_s2d) / + regular->throughput_s2d) * 100.0; + double throughput_d2s_improvement = ((thp->throughput_d2s - regular->throughput_d2s) / + regular->throughput_d2s) * 100.0; + + printf("\n=== %s (%.1f MB) ===\n", test_name, buffer_size / (1024.0 * 1024.0)); + printf(" | With THP | Without THP | Improvement\n"); + printf("---------------------------------------------------------------------\n"); + printf("Sys->Dev Migration | %.3f ms | %.3f ms | %.1f%%\n", + thp->sys_to_dev_time, regular->sys_to_dev_time, s2d_improvement); + printf("Dev->Sys Migration | %.3f ms | %.3f ms | %.1f%%\n", + thp->dev_to_sys_time, regular->dev_to_sys_time, d2s_improvement); + printf("S->D Throughput | %.2f GB/s | %.2f GB/s | %.1f%%\n", + thp->throughput_s2d, regular->throughput_s2d, throughput_s2d_improvement); + printf("D->S Throughput | %.2f GB/s | %.2f GB/s | %.1f%%\n", + thp->throughput_d2s, regular->throughput_d2s, throughput_d2s_improvement); +} + +/* + * Run a single migration benchmark + * fd: file descriptor for hmm device + * use_thp: whether to use THP + * buffer_size: size of buffer to allocate + * iterations: number of iterations + * results: where to store results + */ +static inline int run_migration_benchmark(int fd, int use_thp, size_t buffer_size, + int iterations, struct benchmark_results *results) +{ + struct hmm_buffer *buffer; + unsigned long npages = buffer_size / sysconf(_SC_PAGESIZE); + double start, end; + double s2d_total = 0, d2s_total = 0; + int ret, i; + int *ptr; + + buffer = hmm_buffer_alloc(buffer_size); + + /* Map memory */ + buffer->ptr = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (!buffer->ptr) + return -1; + + /* Apply THP hint if requested */ + if (use_thp) + ret = madvise(buffer->ptr, buffer_size, MADV_HUGEPAGE); + else + ret = madvise(buffer->ptr, buffer_size, MADV_NOHUGEPAGE); + + if (ret) + return ret; + + /* Initialize memory to make sure pages are allocated */ + ptr = (int *)buffer->ptr; + for (i = 0; i < buffer_size / sizeof(int); i++) + ptr[i] = i & 0xFF; + + /* Warmup iteration */ + ret = hmm_migrate_sys_to_dev(fd, buffer, npages); + if (ret) + return ret; + + ret = hmm_migrate_dev_to_sys(fd, buffer, npages); + if (ret) + return ret; + + /* Benchmark iterations */ + for (i = 0; i < iterations; i++) { + /* System to device migration */ + start = get_time_ms(); + + ret = hmm_migrate_sys_to_dev(fd, buffer, npages); + if (ret) + return ret; + + end = get_time_ms(); + s2d_total += (end - start); + + /* Device to system migration */ + start = get_time_ms(); + + ret = hmm_migrate_dev_to_sys(fd, buffer, npages); + if (ret) + return ret; + + end = get_time_ms(); + d2s_total += (end - start); + } + + /* Calculate average times and throughput */ + results->sys_to_dev_time = s2d_total / iterations; + results->dev_to_sys_time = d2s_total / iterations; + results->throughput_s2d = (buffer_size / (1024.0 * 1024.0 * 1024.0)) / + (results->sys_to_dev_time / 1000.0); + results->throughput_d2s = (buffer_size / (1024.0 * 1024.0 * 1024.0)) / + (results->dev_to_sys_time / 1000.0); + + /* Cleanup */ + hmm_buffer_free(buffer); + return 0; +} + +/* + * Benchmark THP migration with different buffer sizes + */ +TEST_F_TIMEOUT(hmm, benchmark_thp_migration, 120) +{ + struct benchmark_results thp_results, regular_results; + size_t thp_size = 2 * 1024 * 1024; /* 2MB - typical THP size */ + int iterations = 5; + + printf("\nHMM THP Migration Benchmark\n"); + printf("---------------------------\n"); + printf("System page size: %ld bytes\n", sysconf(_SC_PAGESIZE)); + + /* Test different buffer sizes */ + size_t test_sizes[] = { + thp_size / 4, /* 512KB - smaller than THP */ + thp_size / 2, /* 1MB - half THP */ + thp_size, /* 2MB - single THP */ + thp_size * 2, /* 4MB - two THPs */ + thp_size * 4, /* 8MB - four THPs */ + thp_size * 8, /* 16MB - eight THPs */ + thp_size * 128, /* 256MB - one twenty eight THPs */ + }; + + static const char *const test_names[] = { + "Small Buffer (512KB)", + "Half THP Size (1MB)", + "Single THP Size (2MB)", + "Two THP Size (4MB)", + "Four THP Size (8MB)", + "Eight THP Size (16MB)", + "One twenty eight THP Size (256MB)" + }; + + int num_tests = ARRAY_SIZE(test_sizes); + + /* Run all tests */ + for (int i = 0; i < num_tests; i++) { + /* Test with THP */ + ASSERT_EQ(run_migration_benchmark(self->fd, 1, test_sizes[i], + iterations, &thp_results), 0); + + /* Test without THP */ + ASSERT_EQ(run_migration_benchmark(self->fd, 0, test_sizes[i], + iterations, ®ular_results), 0); + + /* Print results */ + print_benchmark_results(test_names[i], test_sizes[i], + &thp_results, ®ular_results); + } +} TEST_HARNESS_MAIN -- cgit v1.2.3 From 218fbfad16341ae60b7d540ca65af016b9f83500 Mon Sep 17 00:00:00 2001 From: Peng Li Date: Mon, 17 Nov 2025 23:40:11 +0800 Subject: selftests/mm: gup_test: stop testing FOLL_TOUCH commit 0f20bba1688b ("mm/gup: explicitly define and check internal GUP flags, disallow FOLL_TOUCH") marked FOLL_TOUCH as a GUP-internal flag. This causes a warning to fire when running gup_test, for example: $ ./gup_test -L -r 100 -z dmesg: WARNING: CPU: 1 PID: 117 at mm/gup.c:2512 is_valid_gup_args+0x66/0x8c Therefore, remove the "FOLL_TOUCH" test code from gup_test.c. Link: https://lkml.kernel.org/r/20251117154012.197499-1-peng8420.li@gmail.com Signed-off-by: Peng Li Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand (Red Hat) Cc: Dan Williams Cc: Jason Gunthorpe Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_test.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 8900b840c17a..75f7134d529d 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -19,7 +19,6 @@ /* Just the flags we need, copied from mm.h: */ #define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ #define GUP_TEST_FILE "/sys/kernel/debug/gup_test" @@ -93,7 +92,7 @@ int main(int argc, char **argv) { struct gup_test gup = { 0 }; int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; - int flags = MAP_PRIVATE, touch = 0; + int flags = MAP_PRIVATE; char *file = "/dev/zero"; pthread_t *tid; char *p; @@ -170,10 +169,6 @@ int main(int argc, char **argv) case 'H': flags |= (MAP_HUGETLB | MAP_ANONYMOUS); break; - case 'z': - /* fault pages in gup, do not fault in userland */ - touch = 1; - break; default: ksft_exit_fail_msg("Wrong argument\n"); } @@ -244,18 +239,9 @@ int main(int argc, char **argv) else if (thp == 0) madvise(p, size, MADV_NOHUGEPAGE); - /* - * FOLL_TOUCH, in gup_test, is used as an either/or case: either - * fault pages in from the kernel via FOLL_TOUCH, or fault them - * in here, from user space. This allows comparison of performance - * between those two cases. - */ - if (touch) { - gup.gup_flags |= FOLL_TOUCH; - } else { - for (; (unsigned long)p < gup.addr + size; p += psize()) - p[0] = 0; - } + /* Fault them in here, from user space. */ + for (; (unsigned long)p < gup.addr + size; p += psize()) + p[0] = 0; tid = malloc(sizeof(pthread_t) * nthreads); assert(tid); -- cgit v1.2.3 From 3e700b715e1cef66371027cefd3761eb8fa6e0d8 Mon Sep 17 00:00:00 2001 From: Peng Li Date: Mon, 17 Nov 2025 23:40:12 +0800 Subject: selftests/mm: gup_test: fix comment regarding origin of FOLL_WRITE The 'FOLL_WRITE' of the copied source is located in mm_types.h of mm, not mm.h, so fix it. Link: https://lkml.kernel.org/r/20251117154012.197499-2-peng8420.li@gmail.com Signed-off-by: Peng Li Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand (Red Hat) Cc: Dan Williams Cc: Jason Gunthorpe Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 75f7134d529d..40c1538a17b4 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -17,7 +17,7 @@ #define MB (1UL << 20) -/* Just the flags we need, copied from mm.h: */ +/* Just the flags we need, copied from the kernel internals. */ #define FOLL_WRITE 0x01 /* check pte is writable */ #define GUP_TEST_FILE "/sys/kernel/debug/gup_test" -- cgit v1.2.3 From 277a1ae3879a82a15a2e2d6741e38e31ea6487ee Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:01 +0800 Subject: mm: softdirty: add pgtable_supports_soft_dirty() Patch series "mm: Add soft-dirty and uffd-wp support for RISC-V", v15. This patchset adds support for Svrsw60t59b [1] extension which is ratified now, also add soft dirty and userfaultfd write protect tracking for RISC-V. The patches 1 and 2 add macros to allow architectures to define their own checks if the soft-dirty / uffd_wp PTE bits are available, in other words for RISC-V, the Svrsw60t59b extension is supported on which device the kernel is running. Also patch1-2 are removing "ifdef CONFIG_MEM_SOFT_DIRTY" "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of checks which if not overridden by the architecture, no change in behavior is expected. This patchset has been tested with kselftest mm suite in which soft-dirty, madv_populate, test_unmerge_uffd_wp, and uffd-unit-tests run and pass, and no regressions are observed in any of the other tests. This patch (of 6): Some platforms can customize the PTE PMD entry soft-dirty bit making it unavailable even if the architecture provides the resource. Add an API which architectures can define their specific implementations to detect if soft-dirty bit is available on which device the kernel is running. This patch is removing "ifdef CONFIG_MEM_SOFT_DIRTY" in favor of pgtable_supports_soft_dirty() checks that defaults to IS_ENABLED(CONFIG_MEM_SOFT_DIRTY), if not overridden by the architecture, no change in behavior is expected. We make sure to never set VM_SOFTDIRTY if !pgtable_supports_soft_dirty(), so we will never run into VM_SOFTDIRTY checks. [lorenzo.stoakes@oracle.com: fix VMA selftests] Link: https://lkml.kernel.org/r/dac6ddfe-773a-43d5-8f69-021b9ca4d24b@lucifer.local Link: https://lkml.kernel.org/r/20251113072806.795029-1-zhangchunyan@iscas.ac.cn Link: https://lkml.kernel.org/r/20251113072806.795029-2-zhangchunyan@iscas.ac.cn Link: https://github.com/riscv-non-isa/riscv-iommu/pull/543 [1] Signed-off-by: Chunyan Zhang Acked-by: David Hildenbrand Cc: Albert Ou Cc: Alexandre Ghiti Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Cc: Alexandre Ghiti Cc: Andrew Jones Cc: Conor Dooley Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 81b501f51948..be99056c5d56 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -212,6 +212,8 @@ typedef __bitwise unsigned int vm_fault_t; #define ASSERT_EXCLUSIVE_WRITER(x) +#define pgtable_supports_soft_dirty() 1 + /** * swap - swap values of @a and @b * @a: first value -- cgit v1.2.3 From ccf9eb326b4aabcd61e71d87469cafd6c5f01308 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 21 Nov 2025 17:25:18 +0000 Subject: tools/testing/vma: add missing stub vm_flags_reset() is not available in the userland VMA tests, so add a stub which const-casts vma->vm_flags and avoids the upcoming removal of the vma->__vm_flags field. Link: https://lkml.kernel.org/r/4aff8bf7-d367-4ba3-90ad-13eef7a063fa@lucifer.local Fixes: c5c67c1de357 ("tools/testing/vma: eliminate dependency on vma->__vm_flags") Signed-off-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index be99056c5d56..8c2ac301a00e 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1549,4 +1549,11 @@ static inline int do_munmap(struct mm_struct *, unsigned long, size_t, return 0; } +static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) +{ + vm_flags_t *dst = (vm_flags_t *)(&vma->vm_flags); + + *dst = flags; +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From 2b6a3f061f11372af79b862d6184d43193ae927f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:00:59 +0000 Subject: mm: declare VMA flags by bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "initial work on making VMA flags a bitmap", v3. We are in the rather silly situation that we are running out of VMA flags as they are currently limited to a system word in size. This leads to absurd situations where we limit features to 64-bit architectures only because we simply do not have the ability to add a flag for 32-bit ones. This is very constraining and leads to hacks or, in the worst case, simply an inability to implement features we want for entirely arbitrary reasons. This also of course gives us something of a Y2K type situation in mm where we might eventually exhaust all of the VMA flags even on 64-bit systems. This series lays the groundwork for getting away from this limitation by establishing VMA flags as a bitmap whose size we can increase in future beyond 64 bits if required. This is necessarily a highly iterative process given the extensive use of VMA flags throughout the kernel, so we start by performing basic steps. Firstly, we declare VMA flags by bit number rather than by value, retaining the VM_xxx fields but in terms of these newly introduced VMA_xxx_BIT fields. While we are here, we use sparse annotations to ensure that, when dealing with VMA bit number parameters, we cannot be passed values which are not declared as such - providing some useful type safety. We then introduce an opaque VMA flag type, much like the opaque mm_struct flag type introduced in commit bb6525f2f8c4 ("mm: add bitmap mm->flags field"), which we establish in union with vma->vm_flags (but still set at system word size meaning there is no functional or data type size change). We update the vm_flags_xxx() helpers to use this new bitmap, introducing sensible helpers to do so. This series lays the foundation for further work to expand the use of bitmap VMA flags and eventually eliminate these arbitrary restrictions. This patch (of 4): In order to lay the groundwork for VMA flags being a bitmap rather than a system word in size, we need to be able to consistently refer to VMA flags by bit number rather than value. Take this opportunity to do so in an enum which we which is additionally useful for tooling to extract metadata from. This additionally makes it very clear which bits are being used for what at a glance. We use the VMA_ prefix for the bit values as it is logical to do so since these reference VMAs. We consistently suffix with _BIT to make it clear what the values refer to. We declare bit values even when the flags that use them would not be enabled by config options as this is simply clearer and clearly defines what bit numbers are used for what, at no additional cost. We declare a sparse-bitwise type vma_flag_t which ensures that users can't pass around invalid VMA flags by accident and prepares for future work towards VMA flags being a bitmap where we want to ensure bit values are type safe. To make life easier, we declare some macro helpers - DECLARE_VMA_BIT() allows us to avoid duplication in the enum bit number declarations (and maintaining the sparse __bitwise attribute), and INIT_VM_FLAG() is used to assist with declaration of flags. Unfortunately we can't declare both in the enum, as we run into issue with logic in the kernel requiring that flags are preprocessor definitions, and additionally we cannot have a macro which declares another macro so we must define each flag macro directly. Additionally, update the VMA userland testing vma_internal.h header to include these changes. We also have to fix the parameters to the vma_flag_*_atomic() functions since VMA_MAYBE_GUARD_BIT is now of type vma_flag_t and sparse will complain otherwise. We have to update some rather silly if-deffery found in mm/task_mmu.c which would otherwise break. Finally, we update the rust binding helper as now it cannot auto-detect the flags at all. Link: https://lkml.kernel.org/r/cover.1764064556.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/3a35e5a0bcfa00e84af24cbafc0653e74deda64a.1764064556.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 304 +++++++++++++++++++++++++++++++++------ 1 file changed, 259 insertions(+), 45 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 8c2ac301a00e..b7e8fc9ccdf4 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -46,42 +46,271 @@ extern unsigned long dac_mmap_min_addr; #define MMF_HAS_MDWE 28 +/* + * vm_flags in vm_area_struct, see mm_types.h. + * When changing, update also include/trace/events/mmflags.h + */ + #define VM_NONE 0x00000000 -#define VM_READ 0x00000001 -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 -#define VM_MAYREAD 0x00000010 -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_GROWSDOWN 0x00000100 -#define VM_PFNMAP 0x00000400 -#define VM_MAYBE_GUARD 0x00000800 -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ -#define VM_DONTEXPAND 0x00040000 -#define VM_LOCKONFAULT 0x00080000 -#define VM_ACCOUNT 0x00100000 -#define VM_NORESERVE 0x00200000 -#define VM_MIXEDMAP 0x10000000 -#define VM_STACK VM_GROWSDOWN -#define VM_SHADOW_STACK VM_NONE -#define VM_SOFTDIRTY 0 -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_GROWSUP VM_NONE -#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ #ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), #endif +}; + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. */ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif +#if defined(CONFIG_PPC64) +#define VM_SAO INIT_VM_FLAG(SAO) +#elif defined(CONFIG_PARISC) +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) +#elif defined(CONFIG_SPARC64) +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif defined(CONFIG_ARM64) +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif !defined(CONFIG_MMU) +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) +#endif +#ifndef VM_GROWSUP +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) +#else +#define VM_UFFD_MINOR VM_NONE +#endif +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#else +#define VM_DROPPABLE VM_NONE +#endif + +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + +/* + * Special vmas that are non-mergable, non-mlock()able. + */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) #define TASK_SIZE_LOW DEFAULT_MAP_WINDOW @@ -97,26 +326,11 @@ extern unsigned long dac_mmap_min_addr; #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC - -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) - #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ #define CAP_IPC_LOCK 14 -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) -#else -#define VM_SEALED VM_NONE -#endif - /* * Flags which should be 'sticky' on merge - that is, flags which, when one VMA * possesses it but the other does not, the merged VMA should nonetheless have -- cgit v1.2.3 From 4c613f518f786fb0ca4850e4ca5f1933d6a4a304 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:01 +0000 Subject: tools/testing/vma: eliminate dependency on vma->__vm_flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The userland VMA test code relied on an internal implementation detail - the existence of vma->__vm_flags to directly access VMA flags. There is no need to do so when we have the vm_flags_*() helper functions available. This is ugly, but also a subsequent commit will eliminate this field altogether so this will shortly become broken. This patch has us utilise the helper functions instead. Link: https://lkml.kernel.org/r/6275c53a6bb20743edcbe92d3e130183b47d18d0.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index be79ab2ea44b..93d21bc7e112 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -69,18 +69,18 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, pgoff_t pgoff, vm_flags_t vm_flags) { - struct vm_area_struct *ret = vm_area_alloc(mm); + struct vm_area_struct *vma = vm_area_alloc(mm); - if (ret == NULL) + if (vma == NULL) return NULL; - ret->vm_start = start; - ret->vm_end = end; - ret->vm_pgoff = pgoff; - ret->__vm_flags = vm_flags; - vma_assert_detached(ret); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vm_flags_reset(vma, vm_flags); + vma_assert_detached(vma); - return ret; + return vma; } /* Helper function to allocate a VMA and link it to the tree. */ @@ -714,7 +714,7 @@ static bool test_vma_merge_special_flags(void) for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = vm_flags | special_flag; + vm_flags_reset(vma_left, vm_flags | special_flag); vmg.vm_flags = vm_flags | special_flag; vma = merge_new(&vmg); ASSERT_EQ(vma, NULL); @@ -736,7 +736,7 @@ static bool test_vma_merge_special_flags(void) for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = vm_flags | special_flag; + vm_flags_reset(vma_left, vm_flags | special_flag); vmg.vm_flags = vm_flags | special_flag; vma = merge_existing(&vmg); ASSERT_EQ(vma, NULL); -- cgit v1.2.3 From 9ea35a25d51b13013b724943a177a7aaf4bfed71 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:02 +0000 Subject: mm: introduce VMA flags bitmap type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is useful to transition to using a bitmap for VMA flags so we can avoid running out of flags, especially for 32-bit kernels which are constrained to 32 flags, necessitating some features to be limited to 64-bit kernels only. By doing so, we remove any constraint on the number of VMA flags moving forwards no matter the platform and can decide in future to extend beyond 64 if required. We start by declaring an opaque types, vma_flags_t (which resembles mm_struct flags of type mm_flags_t), setting it to precisely the same size as vm_flags_t, and place it in union with vm_flags in the VMA declaration. We additionally update struct vm_area_desc equivalently placing the new opaque type in union with vm_flags. This change therefore does not impact the size of struct vm_area_struct or struct vm_area_desc. In order for the change to be iterative and to avoid impacting performance, we designate VM_xxx declared bitmap flag values as those which must exist in the first system word of the VMA flags bitmap. We therefore declare vma_flags_clear_all(), vma_flags_overwrite_word(), vma_flags_overwrite_word(), vma_flags_overwrite_word_once(), vma_flags_set_word() and vma_flags_clear_word() in order to allow us to update the existing vm_flags_*() functions to utilise these helpers. This is a stepping stone towards converting users to the VMA flags bitmap and behaves precisely as before. By doing this, we can eliminate the existing private vma->__vm_flags field in the vma->vm_flags union and replace it with the newly introduced opaque type vma_flags, which we call flags so we refer to the new bitmap field as vma->flags. We update vma_flag_[test, set]_atomic() to account for the change also. We adapt vm_flags_reset_once() to only clear those bits above the first system word providing write-once semantics to the first system word (which it is presumed the caller requires - and in all current use cases this is so). As we currently only specify that the VMA flags bitmap size is equal to BITS_PER_LONG number of bits, this is a noop, but is defensive in preparation for a future change that increases this. We additionally update the VMA userland test declarations to implement the same changes there. Finally, we update the rust code to reference vma->vm_flags on update rather than vma->__vm_flags which has been removed. This is safe for now, albeit it is implicitly performing a const cast. Once we introduce flag helpers we can improve this more. No functional change intended. Link: https://lkml.kernel.org/r/bab179d7b153ac12f221b7d65caac2759282cfe9.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 150 +++++++++++++++++++++++++++++++-------- 1 file changed, 120 insertions(+), 30 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index b7e8fc9ccdf4..9f0a9f5ed0fe 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -524,6 +524,15 @@ typedef struct { __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } mm_flags_t; +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. + */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -608,7 +617,10 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ @@ -654,7 +666,7 @@ struct vm_area_struct { */ union { const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -1368,26 +1380,6 @@ static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, return true; } -static inline void vm_flags_init(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma->__vm_flags = flags; -} - -static inline void vm_flags_set(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags |= flags; -} - -static inline void vm_flags_clear(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags &= ~flags; -} - static inline int shmem_zero_setup(struct vm_area_struct *vma) { return 0; @@ -1544,13 +1536,118 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } -# define ACCESS_PRIVATE(p, member) ((p)->member) +#define ACCESS_PRIVATE(p, member) ((p)->member) + +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + +static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) +{ + unsigned int len = bitmap_size(nbits); + + if (small_const_nbits(nbits)) + *dst = 0; + else + memset(dst, 0, len); +} static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); +} + +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + /* + * The user should only be interested in avoiding reordering of + * assignment to the first word. + */ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word_once(&vma->flags, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_set_word(&vma->flags, flags); +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_clear_word(&vma->flags, flags); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. * @@ -1763,11 +1860,4 @@ static inline int do_munmap(struct mm_struct *, unsigned long, size_t, return 0; } -static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) -{ - vm_flags_t *dst = (vm_flags_t *)(&vma->vm_flags); - - *dst = flags; -} - #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From 0384c8ea96bfe49e82e624e53bfd5f80c3230ea9 Mon Sep 17 00:00:00 2001 From: Ankit Khushwaha Date: Wed, 26 Nov 2025 21:38:30 +0530 Subject: selftests/mm/uffd: initialize char variable to Null In "uffd-stress.c" & "uffd-unit-tests.c". address of char variable having garbage value (uninitialized) is passed to 'write' syscall triggers warning. uffd-stress.c:246:39: warning: variable 'c' is uninitialized when passed as a const pointer argument here [-Wuninitialized-const-pointer] uffd-unit-tests.c:581:31: warning: variable 'c' is uninitialized when passed as a const pointer argument here [-Wuninitialized-const-pointer] so the fix is to assign char variable to '\0' to prevent writing of garbage value. Link: https://lkml.kernel.org/r/20251126160830.52124-1-ankitkhushwaha.linux@gmail.com Signed-off-by: Ankit Khushwaha Reviewed-by: Mike Rapoport (Microsoft) Cc: Bill Wendling Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Nathan Chancellor Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 2 +- tools/testing/selftests/mm/uffd-unit-tests.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index b51c89e1cd1a..700fbaa18d44 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -241,7 +241,7 @@ static int stress(struct uffd_args *args) return 1; for (cpu = 0; cpu < gopts->nr_parallel; cpu++) { - char c; + char c = '\0'; if (bounces & BOUNCE_POLL) { if (write(gopts->pipefd[cpu*2+1], &c, 1) != 1) err("pipefd write error"); diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index f917b4c4c943..f4807242c5b2 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -543,7 +543,7 @@ static void uffd_minor_test_common(uffd_global_test_opts_t *gopts, bool test_col { unsigned long p; pthread_t uffd_mon; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -759,7 +759,7 @@ static void uffd_sigbus_test_common(uffd_global_test_opts_t *gopts, bool wp) pthread_t uffd_mon; pid_t pid; int err; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -819,7 +819,7 @@ static void uffd_events_test_common(uffd_global_test_opts_t *gopts, bool wp) pthread_t uffd_mon; pid_t pid; int err; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -1125,7 +1125,7 @@ uffd_move_test_common(uffd_global_test_opts_t *gopts, { unsigned long nr; pthread_t uffd_mon; - char c; + char c = '\0'; unsigned long long count; struct uffd_args args = { 0 }; char *orig_area_src = NULL, *orig_area_dst = NULL; -- cgit v1.2.3