diff options
| author | David Hildenbrand (Red Hat) <david@kernel.org> | 2026-01-19 23:07:07 +0100 |
|---|---|---|
| committer | Andrew Morton <akpm@linux-foundation.org> | 2026-02-06 15:47:19 -0800 |
| commit | 4c640eb4181cf8de74c8b9e7c9cf16bf8d26b73e (patch) | |
| tree | 916b03d49c6dfa6e088860f2e58079173399a338 /mm/memory.c | |
| parent | 9c8c02df3f8742f6db927e787ab971fd0b5ac08a (diff) | |
mm: move pte table reclaim code to memory.c
Some cleanups for PT table reclaim code, triggered by a false-positive
warning we might start to see soon after we unlocked pt-reclaim on
architectures besides x86-64.
This patch (of 2):
The pte-table reclaim code is only called from memory.c, while zapping
pages, and it better also stays that way in the long run. If we ever have
to call it from other files, we should expose proper high-level helpers
for zapping if the existing helpers are not good enough.
So, let's move the code over (it's not a lot) and clean it up a bit
by:
- Renaming the functions.
- Dropping the "Check if it is empty PTE page" comment, which is now
self-explaining given the function name.
- Making zap_pte_table_if_empty() return whether zapping worked so the
caller can free it.
- Adding a comment in pte_table_reclaim_possible().
- Inlining free_pte() in the last remaining user.
- In zap_empty_pte_table(), switch from pmdp_get_lockless() to
pmdp_get(), as we are holding the PMD PT lock.
By moving the code over, compilers can also easily figure out when
zap_empty_pte_table() does not initialize the pmdval variable, avoiding
false-positive warnings about the variable possibly not being initialized.
Link: https://lkml.kernel.org/r/20260119220708.3438514-1-david@kernel.org
Link: https://lkml.kernel.org/r/20260119220708.3438514-2-david@kernel.org
Signed-off-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
| -rw-r--r-- | mm/memory.c | 68 |
1 files changed, 62 insertions, 6 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 2a347e31a077..de22710bb217 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1821,11 +1821,68 @@ static inline int do_zap_pte_range(struct mmu_gather *tlb,
 	return nr;
 }
 
+static bool pte_table_reclaim_possible(unsigned long start, unsigned long end,
+		struct zap_details *details)
+{
+	if (!IS_ENABLED(CONFIG_PT_RECLAIM))
+		return false;
+	/* Only zap if we are allowed to and cover the full page table. */
+	return details && details->reclaim_pt && (end - start >= PMD_SIZE);
+}
+
+static bool zap_empty_pte_table(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval)
+{
+	spinlock_t *pml = pmd_lockptr(mm, pmd);
+
+	if (!spin_trylock(pml))
+		return false;
+
+	*pmdval = pmdp_get(pmd);
+	pmd_clear(pmd);
+	spin_unlock(pml);
+	return true;
+}
+
+static bool zap_pte_table_if_empty(struct mm_struct *mm, pmd_t *pmd,
+		unsigned long addr, pmd_t *pmdval)
+{
+	spinlock_t *pml, *ptl = NULL;
+	pte_t *start_pte, *pte;
+	int i;
+
+	pml = pmd_lock(mm, pmd);
+	start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, pmdval, &ptl);
+	if (!start_pte)
+		goto out_ptl;
+	if (ptl != pml)
+		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+	for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+		if (!pte_none(ptep_get(pte)))
+			goto out_ptl;
+	}
+	pte_unmap(start_pte);
+
+	pmd_clear(pmd);
+
+	if (ptl != pml)
+		spin_unlock(ptl);
+	spin_unlock(pml);
+	return true;
+out_ptl:
+	if (start_pte)
+		pte_unmap_unlock(start_pte, ptl);
+	if (ptl != pml)
+		spin_unlock(pml);
+	return false;
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
+	bool can_reclaim_pt = pte_table_reclaim_possible(addr, end, details);
 	bool force_flush = false, force_break = false;
 	struct mm_struct *mm = tlb->mm;
 	int rss[NR_MM_COUNTERS];
@@ -1834,7 +1891,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	pte_t *pte;
 	pmd_t pmdval;
 	unsigned long start = addr;
-	bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
 	bool direct_reclaim = true;
 	int nr;
 
@@ -1875,7 +1931,7 @@ retry:
 	 * from being repopulated by another thread.
 	 */
 	if (can_reclaim_pt && direct_reclaim && addr == end)
-		direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+		direct_reclaim = zap_empty_pte_table(mm, pmd, &pmdval);
 
 	add_mm_rss_vec(mm, rss);
 	lazy_mmu_mode_disable();
@@ -1904,10 +1960,10 @@ retry:
 	}
 
 	if (can_reclaim_pt) {
-		if (direct_reclaim)
-			free_pte(mm, start, tlb, pmdval);
-		else
-			try_to_free_pte(mm, pmd, start, tlb);
+		if (direct_reclaim || zap_pte_table_if_empty(mm, pmd, start, &pmdval)) {
+			pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+			mm_dec_nr_ptes(mm);
+		}
 	}
 
 	return addr;
