From c4876ff68716e5372224d17045b47610d667a0ee Mon Sep 17 00:00:00 2001
From: Frank van der Linden <fvdl@google.com>
Date: Mon, 9 Jan 2023 17:43:32 +0000
Subject: mm/debug: use valid physical memory for pmd/pud tests

The page table debug tests need a physical address to validate low-level
page table manipulation with. The memory at this address is not actually
touched, it is just encoded in the page table entries at various levels
during the tests.

Since the memory is not used, the code just picks the physical address of
the start_kernel symbol. This value is then truncated to get a properly
aligned address that is then used for the various tests.

Because of the truncation, the address might not actually exist, or might
not describe a complete huge page. That's not a problem for most tests,
but the arch-specific code may check for attribute validity and
consistency. The x86 version of {pud,pmd}_set_huge actually validates the
MTRRs for the PMD/PUD range. This may fail with an address derived from
start_kernel, depending on where the kernel was loaded and what the
physical memory layout of the system is. This then leads to false
negatives for the {pud,pmd}_set_huge tests.

Avoid this by finding a properly aligned memory range that exists and is
usable. If such a range is not found, skip the tests that need it.

[fvdl@google.com: v3]
  Link: https://lkml.kernel.org/r/20230110181208.1633879-1-fvdl@google.com
Link: https://lkml.kernel.org/r/20230109174332.329366-1-fvdl@google.com
Fixes: 399145f9eb6c ("mm/debug: add tests validating architecture page table helpers")
Signed-off-by: Frank van der Linden <fvdl@google.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/debug_vm_pgtable.c | 102 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 83 insertions(+), 19 deletions(-)

(limited to 'mm/debug_vm_pgtable.c')

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c631ade3f1d2..bb3328f46126 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -15,6 +15,7 @@
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
 #include <linux/kconfig.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mm_types.h>
@@ -80,6 +81,7 @@ struct pgtable_debug_args {
 	unsigned long		pmd_pfn;
 	unsigned long		pte_pfn;
 
+	unsigned long		fixed_alignment;
 	unsigned long		fixed_pgd_pfn;
 	unsigned long		fixed_p4d_pfn;
 	unsigned long		fixed_pud_pfn;
@@ -430,7 +432,8 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args)
 {
 	pmd_t pmd;
 
-	if (!arch_vmap_pmd_supported(args->page_prot))
+	if (!arch_vmap_pmd_supported(args->page_prot) ||
+	    args->fixed_alignment < PMD_SIZE)
 		return;
 
 	pr_debug("Validating PMD huge\n");
@@ -449,7 +452,8 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args)
 {
 	pud_t pud;
 
-	if (!arch_vmap_pud_supported(args->page_prot))
+	if (!arch_vmap_pud_supported(args->page_prot) ||
+	    args->fixed_alignment < PUD_SIZE)
 		return;
 
 	pr_debug("Validating PUD huge\n");
@@ -1077,10 +1081,85 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
 	return page;
 }
 
+/*
+ * Check if a physical memory range described by <pstart, pend> contains
+ * an area that is of size psize, and aligned to psize.
+ *
+ * Don't use address 0, an all-zeroes physical address might mask bugs, and
+ * it's not used on x86.
+ */
+static void __init phys_align_check(phys_addr_t pstart,
+				    phys_addr_t pend, unsigned long psize,
+				    phys_addr_t *physp, unsigned long *alignp)
+{
+	phys_addr_t aligned_start, aligned_end;
+
+	if (pstart == 0)
+		pstart = PAGE_SIZE;
+
+	aligned_start = ALIGN(pstart, psize);
+	aligned_end = aligned_start + psize;
+
+	if (aligned_end > aligned_start && aligned_end <= pend) {
+		*alignp = psize;
+		*physp = aligned_start;
+	}
+}
+
+static void __init init_fixed_pfns(struct pgtable_debug_args *args)
+{
+	u64 idx;
+	phys_addr_t phys, pstart, pend;
+
+	/*
+	 * Initialize the fixed pfns. To do this, try to find a
+	 * valid physical range, preferably aligned to PUD_SIZE,
+	 * but settling for aligned to PMD_SIZE as a fallback. If
+	 * neither of those is found, use the physical address of
+	 * the start_kernel symbol.
+	 *
+	 * The memory doesn't need to be allocated, it just needs to exist
+	 * as usable memory. It won't be touched.
+	 *
+	 * The alignment is recorded, and can be checked to see if we
+	 * can run the tests that require an actual valid physical
+	 * address range on some architectures ({pmd,pud}_huge_test
+	 * on x86).
+	 */
+
+	phys = __pa_symbol(&start_kernel);
+	args->fixed_alignment = PAGE_SIZE;
+
+	for_each_mem_range(idx, &pstart, &pend) {
+		/* First check for a PUD-aligned area */
+		phys_align_check(pstart, pend, PUD_SIZE, &phys,
+				 &args->fixed_alignment);
+
+		/* If a PUD-aligned area is found, we're done */
+		if (args->fixed_alignment == PUD_SIZE)
+			break;
+
+		/*
+		 * If no PMD-aligned area found yet, check for one,
+		 * but continue the loop to look for a PUD-aligned area.
+		 */
+		if (args->fixed_alignment < PMD_SIZE)
+			phys_align_check(pstart, pend, PMD_SIZE, &phys,
+					 &args->fixed_alignment);
+	}
+
+	args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+	args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+	args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+	args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+	args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+	WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+}
+
+
 static int __init init_args(struct pgtable_debug_args *args)
 {
 	struct page *page = NULL;
-	phys_addr_t phys;
 	int ret = 0;
 
 	/*
@@ -1160,22 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args)
 	args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
 	WARN_ON(!args->start_ptep);
 
-	/*
-	 * PFN for mapping at PTE level is determined from a standard kernel
-	 * text symbol. But pfns for higher page table levels are derived by
-	 * masking lower bits of this real pfn. These derived pfns might not
-	 * exist on the platform but that does not really matter as pfn_pxx()
-	 * helpers will still create appropriate entries for the test. This
-	 * helps avoid large memory block allocations to be used for mapping
-	 * at higher page table levels in some of the tests.
-	 */
-	phys = __pa_symbol(&start_kernel);
-	args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
-	args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
-	args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
-	args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
-	args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
-	WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+	init_fixed_pfns(args);
 
 	/*
 	 * Allocate (huge) pages because some of the tests need to access
-- cgit v1.2.3


From 2321ba3e3733f513e46e29b9c70512ecddbf1085 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 13 Jan 2023 18:10:01 +0100
Subject: mm/debug_vm_pgtable: more pte_swp_exclusive() sanity checks

Patch series "mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all
architectures with swap PTEs".

This is the follow-up on [1]:
	[PATCH v2 0/8] mm: COW fixes part 3: reliable GUP R/W FOLL_GET of
	anonymous pages

After we implemented __HAVE_ARCH_PTE_SWP_EXCLUSIVE on most prominent
enterprise architectures, implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all
remaining architectures that support swap PTEs.

This makes sure that exclusive anonymous pages will stay exclusive, even
after they were swapped out -- for example, making GUP R/W FOLL_GET of
anonymous pages reliable. Details can be found in [1].

This primarily fixes remaining known O_DIRECT memory corruptions that can
happen on concurrent swapout, whereby we can lose DMA reads to a page
(modifying the user page by writing to it).

To verify, there are two test cases (requiring swap space, obviously):

(1) The O_DIRECT+swapout test case [2] from Andrea. This test case tries
    triggering a race condition.
(2) My vmsplice() test case [3] that tries to detect if the exclusive
    marker was lost during swapout, not relying on a race condition.

For example, on 32bit x86 (with and without PAE), my test case fails
without these patches:

	$ ./test_swp_exclusive
	FAIL: page was replaced during COW

But succeeds with these patches:

	$ ./test_swp_exclusive
	PASS: page was not replaced during COW

Why implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE for all architectures, even
the ones where swap support might be in a questionable state? This is the
first step towards removing "readable_exclusive" migration entries, and
instead using pte_swp_exclusive() also with (readable) migration entries
(as suggested by Peter). The only missing piece for that is supporting
pmd_swp_exclusive() on relevant architectures with THP migration support.

As all relevant architectures now implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE,
we can drop __HAVE_ARCH_PTE_SWP_EXCLUSIVE in the last patch.

I tried cross-compiling all relevant setups and tested on x86 and sparc64
so far.

CCing arch maintainers only on this cover letter and on the respective
patch(es).

[1] https://lkml.kernel.org/r/20220329164329.208407-1-david@redhat.com
[2] https://gitlab.com/aarcange/kernel-testcases-for-v5.11/-/blob/main/page_count_do_wp_page-swap.c
[3] https://gitlab.com/davidhildenbrand/scratchspace/-/blob/main/test_swp_exclusive.c

This patch (of 26):

We want to implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all architectures.
Let's extend our sanity checks, especially testing that our PTE bit does
not affect:

* is_swap_pte() -> pte_present() and pte_none()
* the swap entry + type
* pte_swp_soft_dirty()

Especially, using pfn_pte() is dodgy when the swap PTE layout differs
heavily from ordinary PTEs. Let's properly construct a swap PTE from
swap type+offset.
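
For reference, the arch contract these checks exercise is small. As a
rough sketch -- modeled on the existing x86 implementation in
arch/x86/include/asm/pgtable.h, where _PAGE_SWP_EXCLUSIVE is a PTE bit
that is unused in swap PTEs -- an architecture provides:

	/* Mark the swap PTE as exclusive by setting the spare bit. */
	static inline pte_t pte_swp_mkexclusive(pte_t pte)
	{
		return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
	}

	static inline int pte_swp_exclusive(pte_t pte)
	{
		return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;
	}

	static inline pte_t pte_swp_clear_exclusive(pte_t pte)
	{
		return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);
	}

The checks below then verify that whatever bit an architecture picks
does not overlap the swap type/offset encoding, the present/none
encoding, or the soft-dirty bit.
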
[david@redhat.com: fix build]
  Link: https://lkml.kernel.org/r/6aaad548-cf48-77fa-9d6c-db83d724b2eb@redhat.com
Link: https://lkml.kernel.org/r/20230113171026.582290-1-david@redhat.com
Link: https://lkml.kernel.org/r/20230113171026.582290-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli
Cc: Anton Ivanov
Cc: Borislav Petkov (AMD)
Cc: Brian Cain
Cc: Christophe Leroy
Cc: Chris Zankel
Cc: Dave Hansen
Cc: David S. Miller
Cc: Dinh Nguyen
Cc: Geert Uytterhoeven
Cc: Greg Ungerer
Cc: Guo Ren
Cc: Helge Deller
Cc: H. Peter Anvin (Intel)
Cc: Huacai Chen
Cc: Hugh Dickins
Cc: Ingo Molnar
Cc: Ivan Kokshaysky
Cc: James Bottomley
Cc: Jason Gunthorpe
Cc: Johannes Berg
Cc: John Hubbard
Cc: Matt Turner
Cc: Max Filippov
Cc: Michael Ellerman
Cc: Michal Simek
Cc: Mike Rapoport
Cc: Nadav Amit
Cc: Nicholas Piggin
Cc: Palmer Dabbelt
Cc: Paul Walmsley
Cc: Peter Xu
Cc: Richard Henderson
Cc: Richard Weinberger
Cc: Rich Felker
Cc: Russell King
Cc: Stafford Horne
Cc: Stefan Kristiansson
Cc: Thomas Bogendoerfer
Cc: Thomas Gleixner
Cc: Vineet Gupta
Cc: Vlastimil Babka
Cc: Xuerui Wang
Cc: Yang Shi
Cc: Yoshinori Sato
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/debug_vm_pgtable.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'mm/debug_vm_pgtable.c')

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index bb3328f46126..ff8d6f6af896 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -811,13 +811,36 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
 
 static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
 {
 #ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE
-	pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
+	unsigned long max_swap_offset;
+	swp_entry_t entry, entry2;
+	pte_t pte;
 
 	pr_debug("Validating PTE swap exclusive\n");
+
+	/* See generic_max_swapfile_size(): probe the maximum offset */
+	max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
+
+	/* Create a swp entry with all possible bits set */
+	entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+
+	pte = swp_entry_to_pte(entry);
+	WARN_ON(pte_swp_exclusive(pte));
+	WARN_ON(!is_swap_pte(pte));
+	entry2 = pte_to_swp_entry(pte);
+	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+
 	pte = pte_swp_mkexclusive(pte);
 	WARN_ON(!pte_swp_exclusive(pte));
+	WARN_ON(!is_swap_pte(pte));
+	WARN_ON(pte_swp_soft_dirty(pte));
+	entry2 = pte_to_swp_entry(pte);
+	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+
 	pte = pte_swp_clear_exclusive(pte);
 	WARN_ON(pte_swp_exclusive(pte));
+	WARN_ON(!is_swap_pte(pte));
+	entry2 = pte_to_swp_entry(pte);
+	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
 #endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */
 }
-- cgit v1.2.3


From 950fe885a89770619e315f9b46301eebf0aab7b3 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 13 Jan 2023 18:10:26 +0100
Subject: mm: remove __HAVE_ARCH_PTE_SWP_EXCLUSIVE

__HAVE_ARCH_PTE_SWP_EXCLUSIVE is now supported by all architectures that
support swp PTEs, so let's drop it.
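
The per-architecture patches leading up to this (not visible here, as
this view is limited to mm/debug_vm_pgtable.c) all follow the same
shape; sketched against a hypothetical arch header (illustrative path
and bit name, not taken from the series), only the marker define goes
away while the helpers stay:

	--- a/arch/example/include/asm/pgtable.h
	+++ b/arch/example/include/asm/pgtable.h
	@@
	-#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
	 static inline int pte_swp_exclusive(pte_t pte)
	 {
	 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
	 }
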
Link: https://lkml.kernel.org/r/20230113171026.582290-27-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/debug_vm_pgtable.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'mm/debug_vm_pgtable.c')

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index ff8d6f6af896..af59cc7bd307 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -810,7 +810,6 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
 
 static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
 {
-#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE
 	unsigned long max_swap_offset;
 	swp_entry_t entry, entry2;
 	pte_t pte;
@@ -841,7 +840,6 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
 	WARN_ON(!is_swap_pte(pte));
 	entry2 = pte_to_swp_entry(pte);
 	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
-#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */
 }
 
 static void __init pte_swap_tests(struct pgtable_debug_args *args)
-- cgit v1.2.3
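
A usage note on the tests touched above: they are compiled in with
CONFIG_DEBUG_VM_PGTABLE and run once late in boot; failures show up as
WARN_ON() splats in the kernel log, while the pr_debug("Validating ...")
lines only appear with dynamic debug enabled. A minimal config sketch,
assuming an architecture that selects ARCH_HAS_DEBUG_VM_PGTABLE (x86,
for example):

	# Enables the boot-time page table helper sanity tests.
	CONFIG_DEBUG_VM_PGTABLE=y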