From c4876ff68716e5372224d17045b47610d667a0ee Mon Sep 17 00:00:00 2001
From: Frank van der Linden <fvdl@google.com>
Date: Mon, 9 Jan 2023 17:43:32 +0000
Subject: mm/debug: use valid physical memory for pmd/pud tests

The page table debug tests need a physical address to validate low-level
page table manipulation with. The memory at this address is not actually
touched, it is just encoded in the page table entries at various levels
during the tests.

Since the memory is not used, the code just picks the physical address of
the start_kernel symbol. This value is then truncated to get a properly
aligned address that is then used for the various tests.

Because of the truncation, the address might not actually exist, or might
not describe a complete huge page. That's not a problem for most tests,
but the arch-specific code may check for attribute validity and
consistency. The x86 version of {pud,pmd}_set_huge actually validates the
MTRRs for the PMD/PUD range. This may fail with an address derived from
start_kernel, depending on where the kernel was loaded and what the
physical memory layout of the system is. This then leads to false
negatives for the {pud,pmd}_set_huge tests.

Avoid this by finding a properly aligned memory range that exists and is
usable. If such a range is not found, skip the tests that need it.

[fvdl@google.com: v3]
  Link: https://lkml.kernel.org/r/20230110181208.1633879-1-fvdl@google.com
Link: https://lkml.kernel.org/r/20230109174332.329366-1-fvdl@google.com
Fixes: 399145f9eb6c ("mm/debug: add tests validating architecture page table helpers")
Signed-off-by: Frank van der Linden <fvdl@google.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/debug_vm_pgtable.c | 102 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 83 insertions(+), 19 deletions(-)

(limited to 'mm/debug_vm_pgtable.c')

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c631ade3f1d2..bb3328f46126 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -15,6 +15,7 @@
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
 #include <linux/kconfig.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mm_types.h>
@@ -80,6 +81,7 @@ struct pgtable_debug_args {
 	unsigned long		pmd_pfn;
 	unsigned long		pte_pfn;
 
+	unsigned long		fixed_alignment;
 	unsigned long		fixed_pgd_pfn;
 	unsigned long		fixed_p4d_pfn;
 	unsigned long		fixed_pud_pfn;
@@ -430,7 +432,8 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args)
 {
 	pmd_t pmd;
 
-	if (!arch_vmap_pmd_supported(args->page_prot))
+	if (!arch_vmap_pmd_supported(args->page_prot) ||
+	    args->fixed_alignment < PMD_SIZE)
 		return;
 
 	pr_debug("Validating PMD huge\n");
@@ -449,7 +452,8 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args)
 {
 	pud_t pud;
 
-	if (!arch_vmap_pud_supported(args->page_prot))
+	if (!arch_vmap_pud_supported(args->page_prot) ||
+	    args->fixed_alignment < PUD_SIZE)
 		return;
 
 	pr_debug("Validating PUD huge\n");
@@ -1077,10 +1081,85 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
 	return page;
 }
 
+/*
+ * Check if a physical memory range described by <pstart, pend> contains
+ * an area that is of size psize, and aligned to psize.
+ *
+ * Don't use address 0, an all-zeroes physical address might mask bugs, and
+ * it's not used on x86.
+ */
+static void __init phys_align_check(phys_addr_t pstart,
+				    phys_addr_t pend, unsigned long psize,
+				    phys_addr_t *physp, unsigned long *alignp)
+{
+	phys_addr_t aligned_start, aligned_end;
+
+	if (pstart == 0)
+		pstart = PAGE_SIZE;
+
+	aligned_start = ALIGN(pstart, psize);
+	aligned_end = aligned_start + psize;
+
+	if (aligned_end > aligned_start && aligned_end <= pend) {
+		*alignp = psize;
+		*physp = aligned_start;
+	}
+}
+
+static void __init init_fixed_pfns(struct pgtable_debug_args *args)
+{
+	u64 idx;
+	phys_addr_t phys, pstart, pend;
+
+	/*
+	 * Initialize the fixed pfns. To do this, try to find a
+	 * valid physical range, preferably aligned to PUD_SIZE,
+	 * but settling for aligned to PMD_SIZE as a fallback. If
+	 * neither of those is found, use the physical address of
+	 * the start_kernel symbol.
+	 *
+	 * The memory doesn't need to be allocated, it just needs to exist
+	 * as usable memory. It won't be touched.
+	 *
+	 * The alignment is recorded, and can be checked to see if we
+	 * can run the tests that require an actual valid physical
+	 * address range on some architectures ({pmd,pud}_huge_test
+	 * on x86).
+	 */
+
+	phys = __pa_symbol(&start_kernel);
+	args->fixed_alignment = PAGE_SIZE;
+
+	for_each_mem_range(idx, &pstart, &pend) {
+		/* First check for a PUD-aligned area */
+		phys_align_check(pstart, pend, PUD_SIZE, &phys,
+				 &args->fixed_alignment);
+
+		/* If a PUD-aligned area is found, we're done */
+		if (args->fixed_alignment == PUD_SIZE)
+			break;
+
+		/*
+		 * If no PMD-aligned area found yet, check for one,
+		 * but continue the loop to look for a PUD-aligned area.
+		 */
+		if (args->fixed_alignment < PMD_SIZE)
+			phys_align_check(pstart, pend, PMD_SIZE, &phys,
+					 &args->fixed_alignment);
+	}
+
+	args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+	args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+	args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+	args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+	args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+	WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+}
+
+
 static int __init init_args(struct pgtable_debug_args *args)
 {
 	struct page *page = NULL;
-	phys_addr_t phys;
 	int ret = 0;
 
 	/*
@@ -1160,22 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args)
 	args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
 	WARN_ON(!args->start_ptep);
 
-	/*
-	 * PFN for mapping at PTE level is determined from a standard kernel
-	 * text symbol. But pfns for higher page table levels are derived by
-	 * masking lower bits of this real pfn. These derived pfns might not
-	 * exist on the platform but that does not really matter as pfn_pxx()
-	 * helpers will still create appropriate entries for the test. This
-	 * helps avoid large memory block allocations to be used for mapping
-	 * at higher page table levels in some of the tests.
-	 */
-	phys = __pa_symbol(&start_kernel);
-	args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
-	args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
-	args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
-	args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
-	args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
-	WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+	init_fixed_pfns(args);
 
 	/*
 	 * Allocate (huge) pages because some of the tests need to access
-- cgit v1.2.3


From 2321ba3e3733f513e46e29b9c70512ecddbf1085 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 13 Jan 2023 18:10:01 +0100
Subject: mm/debug_vm_pgtable: more pte_swp_exclusive() sanity checks

Patch series "mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all
architectures with swap PTEs".

This is the follow-up on [1]:
	[PATCH v2 0/8] mm: COW fixes part 3: reliable GUP R/W FOLL_GET of
	anonymous pages

After we implemented __HAVE_ARCH_PTE_SWP_EXCLUSIVE on most prominent
enterprise architectures, implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all
remaining architectures that support swap PTEs.

This makes sure that exclusive anonymous pages will stay exclusive, even
after they were swapped out -- for example, making GUP R/W FOLL_GET of
anonymous pages reliable. Details can be found in [1].

This primarily fixes remaining known O_DIRECT memory corruptions that can
happen on concurrent swapout, whereby we can lose DMA reads to a page
(modifying the user page by writing to it).

To verify, there are two test cases (requiring swap space, obviously):

(1) The O_DIRECT+swapout test case [2] from Andrea. This test case tries
    triggering a race condition.
(2) My vmsplice() test case [3] that tries to detect if the exclusive
    marker was lost during swapout, not relying on a race condition.

For example, on 32bit x86 (with and without PAE), my test case fails
without these patches:

	$ ./test_swp_exclusive
	FAIL: page was replaced during COW

But succeeds with these patches:

	$ ./test_swp_exclusive
	PASS: page was not replaced during COW

Why implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE for all architectures, even
the ones where swap support might be in a questionable state? This is the
first step towards removing "readable_exclusive" migration entries, and
instead using pte_swp_exclusive() also with (readable) migration entries
(as suggested by Peter). The only missing piece for that is supporting
pmd_swp_exclusive() on relevant architectures with THP migration support.

As all relevant architectures now implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE,
we can drop __HAVE_ARCH_PTE_SWP_EXCLUSIVE in the last patch.

I tried cross-compiling all relevant setups and tested on x86 and sparc64
so far.

CCing arch maintainers only on this cover letter and on the respective
patch(es).

[1] https://lkml.kernel.org/r/20220329164329.208407-1-david@redhat.com
[2] https://gitlab.com/aarcange/kernel-testcases-for-v5.11/-/blob/main/page_count_do_wp_page-swap.c
[3] https://gitlab.com/davidhildenbrand/scratchspace/-/blob/main/test_swp_exclusive.c

This patch (of 26):

We want to implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all architectures.
Let's extend our sanity checks, especially testing that our PTE bit does
not affect:

* is_swap_pte() -> pte_present() and pte_none()
* the swap entry + type
* pte_swp_soft_dirty()

Especially, using pfn_pte() is dodgy when the swap PTE layout differs
heavily from ordinary PTEs. Let's properly construct a swap PTE from
swap type+offset.
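
For reference, the arch contract these checks exercise is small. As a
rough sketch -- modeled on the existing x86 implementation in
arch/x86/include/asm/pgtable.h, where _PAGE_SWP_EXCLUSIVE is a PTE bit
that is unused in swap PTEs -- an architecture provides:

	/* Mark the swap PTE as exclusive by setting the spare bit. */
	static inline pte_t pte_swp_mkexclusive(pte_t pte)
	{
		return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
	}

	static inline int pte_swp_exclusive(pte_t pte)
	{
		return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;
	}

	static inline pte_t pte_swp_clear_exclusive(pte_t pte)
	{
		return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);
	}

The checks below then verify that whatever bit an architecture picks
does not overlap the swap type/offset encoding, the present/none
encoding, or the soft-dirty bit.
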
[david@redhat.com: fix build]
  Link: https://lkml.kernel.org/r/6aaad548-cf48-77fa-9d6c-db83d724b2eb@redhat.com
Link: https://lkml.kernel.org/r/20230113171026.582290-1-david@redhat.com
Link: https://lkml.kernel.org/r/20230113171026.582290-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli
Cc: Anton Ivanov
Cc: Borislav Petkov (AMD)
Cc: Brian Cain
Cc: Christophe Leroy
Cc: Chris Zankel
Cc: Dave Hansen
Cc: David S. Miller
Cc: Dinh Nguyen
Cc: Geert Uytterhoeven
Cc: Greg Ungerer
Cc: Guo Ren
Cc: Helge Deller
Cc: H. Peter Anvin (Intel)
Cc: Huacai Chen
Cc: Hugh Dickins
Cc: Ingo Molnar
Cc: Ivan Kokshaysky
Cc: James Bottomley
Cc: Jason Gunthorpe
Cc: Johannes Berg
Cc: John Hubbard
Cc: Matt Turner
Cc: Max Filippov
Cc: Michael Ellerman
Cc: Michal Simek
Cc: Mike Rapoport
Cc: Nadav Amit
Cc: Nicholas Piggin
Cc: Palmer Dabbelt
Cc: Paul Walmsley
Cc: Peter Xu
Cc: Richard Henderson
Cc: Richard Weinberger
Cc: Rich Felker
Cc: Russell King
Cc: Stafford Horne
Cc: Stefan Kristiansson
Cc: Thomas Bogendoerfer
Cc: Thomas Gleixner
Cc: Vineet Gupta
Cc: Vlastimil Babka
Cc: Xuerui Wang
Cc: Yang Shi
Cc: Yoshinori Sato
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/debug_vm_pgtable.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'mm/debug_vm_pgtable.c')

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index bb3328f46126..ff8d6f6af896 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -811,13 +811,36 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
 
 static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
 {
 #ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE
-	pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
+	unsigned long max_swap_offset;
+	swp_entry_t entry, entry2;
+	pte_t pte;
 
 	pr_debug("Validating PTE swap exclusive\n");
+
+	/* See generic_max_swapfile_size(): probe the maximum offset */
+	max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
+
+	/* Create a swp entry with all possible bits set */
+	entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+
+	pte = swp_entry_to_pte(entry);
+	WARN_ON(pte_swp_exclusive(pte));
+	WARN_ON(!is_swap_pte(pte));
+	entry2 = pte_to_swp_entry(pte);
+	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+
 	pte = pte_swp_mkexclusive(pte);
 	WARN_ON(!pte_swp_exclusive(pte));
+	WARN_ON(!is_swap_pte(pte));
+	WARN_ON(pte_swp_soft_dirty(pte));
+	entry2 = pte_to_swp_entry(pte);
+	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+
 	pte = pte_swp_clear_exclusive(pte);
 	WARN_ON(pte_swp_exclusive(pte));
+	WARN_ON(!is_swap_pte(pte));
+	entry2 = pte_to_swp_entry(pte);
+	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
 #endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */
 }
-- cgit v1.2.3


From 950fe885a89770619e315f9b46301eebf0aab7b3 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 13 Jan 2023 18:10:26 +0100
Subject: mm: remove __HAVE_ARCH_PTE_SWP_EXCLUSIVE

__HAVE_ARCH_PTE_SWP_EXCLUSIVE is now supported by all architectures that
support swp PTEs, so let's drop it.
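
The per-architecture patches leading up to this (not visible here, as
this view is limited to mm/debug_vm_pgtable.c) all follow the same
shape; sketched against a hypothetical arch header (illustrative path
and bit name, not taken from the series), only the marker define goes
away while the helpers stay:

	--- a/arch/example/include/asm/pgtable.h
	+++ b/arch/example/include/asm/pgtable.h
	@@
	-#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
	 static inline int pte_swp_exclusive(pte_t pte)
	 {
	 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
	 }
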
Link: https://lkml.kernel.org/r/20230113171026.582290-27-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/debug_vm_pgtable.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'mm/debug_vm_pgtable.c')

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index ff8d6f6af896..af59cc7bd307 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -810,7 +810,6 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
 
 static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
 {
-#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE
 	unsigned long max_swap_offset;
 	swp_entry_t entry, entry2;
 	pte_t pte;
@@ -841,7 +840,6 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
 	WARN_ON(!is_swap_pte(pte));
 	entry2 = pte_to_swp_entry(pte);
 	WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
-#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */
 }
 
 static void __init pte_swap_tests(struct pgtable_debug_args *args)
-- cgit v1.2.3
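
A usage note on the tests touched above: they are compiled in with
CONFIG_DEBUG_VM_PGTABLE and run once late in boot; failures show up as
WARN_ON() splats in the kernel log, while the pr_debug("Validating ...")
lines only appear with dynamic debug enabled. A minimal config sketch,
assuming an architecture that selects ARCH_HAS_DEBUG_VM_PGTABLE (x86,
for example):

	# Enables the boot-time page table helper sanity tests.
	CONFIG_DEBUG_VM_PGTABLE=y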