Diffstat (limited to 'include/linux/pgtable.h')
-rw-r--r--  include/linux/pgtable.h  376
1 file changed, 301 insertions(+), 75 deletions(-)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 652f287c1ef6..cdd68ed3ae1a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -23,25 +23,6 @@
#endif
/*
- * On almost all architectures and configurations, 0 can be used as the
- * upper ceiling to free_pgtables(): on many architectures it has the same
- * effect as using TASK_SIZE. However, there is one configuration which
- * must impose a more careful limit, to avoid freeing kernel pgtables.
- */
-#ifndef USER_PGTABLES_CEILING
-#define USER_PGTABLES_CEILING 0UL
-#endif
-
-/*
- * This defines the first usable user address. Platforms
- * can override its value with custom FIRST_USER_ADDRESS
- * defined in their respective <asm/pgtable.h>.
- */
-#ifndef FIRST_USER_ADDRESS
-#define FIRST_USER_ADDRESS 0UL
-#endif
-
-/*
* This defines the generic helper for accessing PMD page
* table page. Although platforms can still override this
* via their respective <asm/pgtable.h>.
@@ -225,16 +206,156 @@ static inline int pmd_dirty(pmd_t pmd)
* up to date.
*
* In the general case, no lock is guaranteed to be held between entry and exit
- * of the lazy mode. So the implementation must assume preemption may be enabled
- * and cpu migration is possible; it must take steps to be robust against this.
- * (In practice, for user PTE updates, the appropriate page table lock(s) are
- * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
- * and the mode cannot be used in interrupt context.
+ * of the lazy mode. (In practice, for user PTE updates, the appropriate page
+ * table lock(s) are held, but for kernel PTE updates, no lock is held.)
+ * The implementation must therefore assume that preemption may be enabled and
+ * CPU migration is possible upon entry to the mode; it must take steps to be
+ * robust against this. An implementation may handle this by disabling
+ * preemption; as a consequence, generic code may not sleep while the lazy MMU
+ * mode is active.
+ *
+ * In interrupt context the mode is disabled and calls to the lazy_mmu_mode_*
+ * API have no effect.
+ *
+ * The lazy MMU mode is enabled for a given block of code using:
+ *
+ * lazy_mmu_mode_enable();
+ * <code>
+ * lazy_mmu_mode_disable();
+ *
+ * Nesting is permitted: <code> may itself use an enable()/disable() pair.
+ * A nested call to enable() has no functional effect; however, disable() causes
+ * any batched architectural state to be flushed regardless of nesting. After a
+ * call to disable(), the caller can therefore rely on all previous page table
+ * modifications to have taken effect, but the lazy MMU mode may still be
+ * enabled.
+ *
+ * In certain cases, it may be desirable to temporarily pause the lazy MMU mode.
+ * This can be done using:
+ *
+ * lazy_mmu_mode_pause();
+ * <code>
+ * lazy_mmu_mode_resume();
+ *
+ * pause() ensures that the mode is exited regardless of the nesting level;
+ * resume() re-enters the mode at the same nesting level. Any call to the
+ * lazy_mmu_mode_* API between those two calls has no effect. In particular,
+ * this means that pause()/resume() pairs may nest.
+ *
+ * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is
+ * currently enabled.
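+ *
+ * As a purely illustrative sketch (the variable names and the allocation are
+ * hypothetical, not part of this API), a caller batching user PTE updates
+ * that needs to allocate, and hence possibly sleep, part-way through might
+ * do:
+ *
+ *	lazy_mmu_mode_enable();
+ *	set_ptes(mm, addr, ptep, pte, nr);	// the arch may defer/batch these
+ *	lazy_mmu_mode_pause();
+ *	buf = kmalloc(size, GFP_KERNEL);	// safe to sleep: mode paused
+ *	lazy_mmu_mode_resume();
+ *	set_pte_at(mm, addr2, ptep2, pte2);
+ *	lazy_mmu_mode_disable();		// batched updates now visible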
+ */
+#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE
+/**
+ * lazy_mmu_mode_enable() - Enable the lazy MMU mode.
+ *
+ * Enters a new lazy MMU mode section; if the mode was not already enabled,
+ * enables it and calls arch_enter_lazy_mmu_mode().
+ *
+ * Must be paired with a call to lazy_mmu_mode_disable().
+ *
+ * Has no effect if called:
+ * - While paused (see lazy_mmu_mode_pause())
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_enable(void)
+{
+ struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+ if (in_interrupt() || state->pause_count > 0)
+ return;
+
+ VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
+
+ if (state->enable_count++ == 0)
+ arch_enter_lazy_mmu_mode();
+}
+
+/**
+ * lazy_mmu_mode_disable() - Disable the lazy MMU mode.
+ *
+ * Exits the current lazy MMU mode section. If it is the outermost section,
+ * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested
+ * section), calls arch_flush_lazy_mmu_mode().
+ *
+ * Must match a call to lazy_mmu_mode_enable().
+ *
+ * Has no effect if called:
+ * - While paused (see lazy_mmu_mode_pause())
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_disable(void)
+{
+ struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+ if (in_interrupt() || state->pause_count > 0)
+ return;
+
+ VM_WARN_ON_ONCE(state->enable_count == 0);
+
+ if (--state->enable_count == 0)
+ arch_leave_lazy_mmu_mode();
+ else /* Exiting a nested section */
+ arch_flush_lazy_mmu_mode();
+}
+
+/**
+ * lazy_mmu_mode_pause() - Pause the lazy MMU mode.
+ *
+ * Pauses the lazy MMU mode; if it is currently active, disables it and calls
+ * arch_leave_lazy_mmu_mode().
+ *
+ * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the
+ * lazy_mmu_mode_* API have no effect until the matching resume() call.
+ *
+ * Has no effect if called:
+ * - While paused (inside another pause()/resume() pair)
+ * - In interrupt context
*/
-#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-static inline void arch_enter_lazy_mmu_mode(void) {}
-static inline void arch_leave_lazy_mmu_mode(void) {}
-static inline void arch_flush_lazy_mmu_mode(void) {}
+static inline void lazy_mmu_mode_pause(void)
+{
+ struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+ if (in_interrupt())
+ return;
+
+ VM_WARN_ON_ONCE(state->pause_count == U8_MAX);
+
+ if (state->pause_count++ == 0 && state->enable_count > 0)
+ arch_leave_lazy_mmu_mode();
+}
+
+/**
+ * lazy_mmu_mode_resume() - Resume the lazy MMU mode.
+ *
+ * Resumes the lazy MMU mode; if it was active at the point where the matching
+ * call to lazy_mmu_mode_pause() was made, re-enables it and calls
+ * arch_enter_lazy_mmu_mode().
+ *
+ * Must match a call to lazy_mmu_mode_pause().
+ *
+ * Has no effect if called:
+ * - While paused (inside another pause()/resume() pair)
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_resume(void)
+{
+ struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+ if (in_interrupt())
+ return;
+
+ VM_WARN_ON_ONCE(state->pause_count == 0);
+
+ if (--state->pause_count == 0 && state->enable_count > 0)
+ arch_enter_lazy_mmu_mode();
+}
+#else
+static inline void lazy_mmu_mode_enable(void) {}
+static inline void lazy_mmu_mode_disable(void) {}
+static inline void lazy_mmu_mode_pause(void) {}
+static inline void lazy_mmu_mode_resume(void) {}
#endif
#ifndef pte_batch_hint
@@ -289,7 +410,7 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr)
{
- page_table_check_ptes_set(mm, ptep, pte, nr);
+ page_table_check_ptes_set(mm, addr, ptep, pte, nr);
for (;;) {
set_pte(ptep, pte);
@@ -370,64 +491,63 @@ static inline pgd_t pgdp_get(pgd_t *pgdp)
#endif
#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address,
- pte_t *ptep)
+static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
{
pte_t pte = ptep_get(ptep);
- int r = 1;
+ bool young = true;
+
if (!pte_young(pte))
- r = 0;
+ young = false;
else
set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
- return r;
+ return young;
}
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
-static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp)
+static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
{
pmd_t pmd = *pmdp;
- int r = 1;
+ bool young = true;
+
if (!pmd_young(pmd))
- r = 0;
+ young = false;
else
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
- return r;
+ return young;
}
#else
-static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp)
+static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
{
BUILD_BUG();
- return 0;
+ return false;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-int ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep);
+bool ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
#endif
#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
+bool pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
#else
/*
* Despite relevant to THP only, this API is called from generic rmap code
* under PageTransHuge(), hence needs a dummy implementation for !THP
*/
-static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
+static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
{
BUILD_BUG();
- return 0;
+ return false;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
@@ -494,7 +614,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
{
pte_t pte = ptep_get(ptep);
pte_clear(mm, address, ptep);
- page_table_check_pte_clear(mm, pte);
+ page_table_check_pte_clear(mm, address, pte);
return pte;
}
#endif
@@ -553,7 +673,7 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
* No need for ptep_get_and_clear(): page table check doesn't care about
* any bits that could have been set by HW concurrently.
*/
- page_table_check_pte_clear(mm, pte);
+ page_table_check_pte_clear(mm, addr, pte);
}
#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
@@ -648,7 +768,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
pmd_t pmd = *pmdp;
pmd_clear(pmdp);
- page_table_check_pmd_clear(mm, pmd);
+ page_table_check_pmd_clear(mm, address, pmd);
return pmd;
}
@@ -661,7 +781,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
pud_t pud = *pudp;
pud_clear(pudp);
- page_table_check_pud_clear(mm, pud);
+ page_table_check_pud_clear(mm, address, pud);
return pud;
}
@@ -947,6 +1067,78 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
}
#endif
+#ifndef clear_flush_young_ptes
+/**
+ * clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same
+ * folio as old and flush the TLB.
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear the access bit for.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_clear_flush_young().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ *
+ * Returns: whether any PTE was young.
+ */
+static inline bool clear_flush_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+ bool young = false;
+
+ for (;;) {
+ young |= ptep_clear_flush_young(vma, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+
+ return young;
+}
+#endif
+
+#ifndef test_and_clear_young_ptes
+/**
+ * test_and_clear_young_ptes - Mark PTEs that map consecutive pages of the same
+ * folio as old
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear the access bit for.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_test_and_clear_young().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ *
+ * Returns: whether any PTE was young.
+ */
+static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+ bool young = false;
+
+ for (;;) {
+ young |= ptep_test_and_clear_young(vma, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+
+ return young;
+}
+#endif
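+
+/*
+ * Usage sketch (illustrative only; the variable names are hypothetical, and
+ * the caller is assumed to hold the PTL with ptep pointing at the first of
+ * nr entries mapping a single folio):
+ *
+ *	bool young = test_and_clear_young_ptes(vma, addr, ptep, nr);
+ *	// young is true if any of the nr entries had its access bit set;
+ *	// all of the access bits are now clear.
+ *
+ * clear_flush_young_ptes() is called the same way when stale TLB entries for
+ * any young PTEs must also be flushed.
+ */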
+
/*
* On some architectures hardware does not set page access bit when accessing
* memory page, it is responsibility of software setting this bit. It brings
@@ -1490,6 +1682,25 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
#endif /* CONFIG_MMU */
/*
+ * On almost all architectures and configurations, 0 can be used as the
+ * upper ceiling to free_pgtables(): on many architectures it has the same
+ * effect as using TASK_SIZE. However, there is one configuration which
+ * must impose a more careful limit, to avoid freeing kernel pgtables.
+ */
+#ifndef USER_PGTABLES_CEILING
+#define USER_PGTABLES_CEILING 0UL
+#endif
+
+/*
+ * This defines the first usable user address. Platforms
+ * can override its value with custom FIRST_USER_ADDRESS
+ * defined in their respective <asm/pgtable.h>.
+ */
+#ifndef FIRST_USER_ADDRESS
+#define FIRST_USER_ADDRESS 0UL
+#endif
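+
+/*
+ * As a hypothetical illustration (not taken from any particular
+ * architecture), a platform that wants to keep the first page of the user
+ * address space unusable could provide in its <asm/pgtable.h>:
+ *
+ *	#define FIRST_USER_ADDRESS	PAGE_SIZE
+ */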
+
+/*
* No-op macros that just return the current protection value. Defined here
* because these macros can be used even if CONFIG_MMU is not defined.
*/
@@ -1742,41 +1953,56 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot);
}
-#ifdef CONFIG_MMU
+/*
+ * ZERO_PAGE() returns a global shared page that is always zero. It is used
+ * for zero-mapped memory areas, CoW, etc.
+ *
+ * On architectures that define __HAVE_COLOR_ZERO_PAGE there are several such
+ * pages, used for different ranges of the virtual address space.
+ *
+ * zero_page_pfn identifies the first (or the only) PFN of these pages.
+ *
+ * On architectures that don't define __HAVE_COLOR_ZERO_PAGE, the zero page
+ * lives in empty_zero_page in BSS.
+ */
+void arch_setup_zero_pages(void);
+
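+/*
+ * Usage sketch (illustrative only; pte, vmf and entry are hypothetical local
+ * variables): a fault handler can map ZERO_PAGE() for an anonymous read
+ * fault and later recognise such a mapping via is_zero_pfn():
+ *
+ *	entry = pte_mkspecial(pfn_pte(zero_pfn(vmf->address),
+ *				      vma->vm_page_prot));
+ *
+ *	if (is_zero_pfn(pte_pfn(pte)))
+ *		// the PTE points at the shared zero page, not real memory
+ */
+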
#ifdef __HAVE_COLOR_ZERO_PAGE
static inline int is_zero_pfn(unsigned long pfn)
{
- extern unsigned long zero_pfn;
- unsigned long offset_from_zero_pfn = pfn - zero_pfn;
+ extern unsigned long zero_page_pfn;
+ unsigned long offset_from_zero_pfn = pfn - zero_page_pfn;
+
return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
}
-#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
+#define zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
#else
static inline int is_zero_pfn(unsigned long pfn)
{
- extern unsigned long zero_pfn;
- return pfn == zero_pfn;
-}
+ extern unsigned long zero_page_pfn;
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- extern unsigned long zero_pfn;
- return zero_pfn;
+ return pfn == zero_page_pfn;
}
-#endif
-#else
-static inline int is_zero_pfn(unsigned long pfn)
+
+static inline unsigned long zero_pfn(unsigned long addr)
{
- return 0;
+ extern unsigned long zero_page_pfn;
+
+ return zero_page_pfn;
}
-static inline unsigned long my_zero_pfn(unsigned long addr)
+extern uint8_t empty_zero_page[PAGE_SIZE];
+extern struct page *__zero_page;
+
+static inline struct page *_zero_page(unsigned long addr)
{
- return 0;
+ return __zero_page;
}
-#endif /* CONFIG_MMU */
+#define ZERO_PAGE(vaddr) _zero_page(vaddr)
+
+#endif /* __HAVE_COLOR_ZERO_PAGE */
#ifdef CONFIG_MMU
@@ -1814,7 +2040,7 @@ static inline int pud_trans_unstable(pud_t *pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- pud_t pudval = READ_ONCE(*pud);
+ pud_t pudval = pudp_get(pud);
if (pud_none(pudval) || pud_trans_huge(pudval))
return 1;