summaryrefslogtreecommitdiff
path: root/mm/memory.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-12 11:32:37 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-12 11:32:37 -0800
commit4cff5c05e076d2ee4e34122aa956b84a2eaac587 (patch)
tree6717207240b3881d1b48ff7cd86b193506756e6c /mm/memory.c
parent541c43310e85dbf35368b43b720c6724bc8ad8ec (diff)
parentfb4ddf2085115ed28dedc427d9491707b476bbfe (diff)
Merge tag 'mm-stable-2026-02-11-19-22' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - "powerpc/64s: do not re-activate batched TLB flush" makes arch_{enter|leave}_lazy_mmu_mode() nest properly (Alexander Gordeev) It adds a generic enter/leave layer and switches architectures to use it. Various hacks were removed in the process. - "zram: introduce compressed data writeback" implements data compression for zram writeback (Richard Chang and Sergey Senozhatsky) - "mm: folio_zero_user: clear page ranges" adds clearing of contiguous page ranges for hugepages. Large improvements during demand faulting are demonstrated (David Hildenbrand) - "memcg cleanups" tidies up some memcg code (Chen Ridong) - "mm/damon: introduce {,max_}nr_snapshots and tracepoint for damos stats" improves DAMOS stat's provided information, deterministic control, and readability (SeongJae Park) - "selftests/mm: hugetlb cgroup charging: robustness fixes" fixes a few issues in the hugetlb cgroup charging selftests (Li Wang) - "Fix va_high_addr_switch.sh test failure - again" addresses several issues in the va_high_addr_switch test (Chunyu Hu) - "mm/damon/tests/core-kunit: extend existing test scenarios" improves the KUnit test coverage for DAMON (Shu Anzai) - "mm/khugepaged: fix dirty page handling for MADV_COLLAPSE" fixes a glitch in khugepaged which was causing madvise(MADV_COLLAPSE) to transiently return -EAGAIN (Shivank Garg) - "arch, mm: consolidate hugetlb early reservation" reworks and consolidates a pile of straggly code related to reservation of hugetlb memory from bootmem and creation of CMA areas for hugetlb (Mike Rapoport) - "mm: clean up anon_vma implementation" cleans up the anon_vma implementation in various ways (Lorenzo Stoakes) - "tweaks for __alloc_pages_slowpath()" does a little streamlining of the page allocator's slowpath code (Vlastimil Babka) - "memcg: separate private and public ID namespaces" cleans up the memcg ID code and prevents the internal-only private IDs from being exposed to userspace (Shakeel Butt) - "mm: hugetlb: allocate frozen gigantic folio" cleans up the allocation of frozen folios and avoids some atomic refcount operations (Kefeng Wang) - "mm/damon: advance DAMOS-based LRU sorting" improves DAMOS's movement of memory betewwn the active and inactive LRUs and adds auto-tuning of the ratio-based quotas and of monitoring intervals (SeongJae Park) - "Support page table check on PowerPC" makes CONFIG_PAGE_TABLE_CHECK_ENFORCED work on powerpc (Andrew Donnellan) - "nodemask: align nodes_and{,not} with underlying bitmap ops" makes nodes_and() and nodes_andnot() propagate the return values from the underlying bit operations, enabling some cleanup in calling code (Yury Norov) - "mm/damon: hide kdamond and kdamond_lock from API callers" cleans up some DAMON internal interfaces (SeongJae Park) - "mm/khugepaged: cleanups and scan limit fix" does some cleanup work in khupaged and fixes a scan limit accounting issue (Shivank Garg) - "mm: balloon infrastructure cleanups" goes to town on the balloon infrastructure and its page migration function. Mainly cleanups, also some locking simplification (David Hildenbrand) - "mm/vmscan: add tracepoint and reason for kswapd_failures reset" adds additional tracepoints to the page reclaim code (Jiayuan Chen) - "Replace wq users and add WQ_PERCPU to alloc_workqueue() users" is part of Marco's kernel-wide migration from the legacy workqueue APIs over to the preferred unbound workqueues (Marco Crivellari) - "Various mm kselftests improvements/fixes" provides various unrelated improvements/fixes for the mm kselftests (Kevin Brodsky) - "mm: accelerate gigantic folio allocation" greatly speeds up gigantic folio allocation, mainly by avoiding unnecessary work in pfn_range_valid_contig() (Kefeng Wang) - "selftests/damon: improve leak detection and wss estimation reliability" improves the reliability of two of the DAMON selftests (SeongJae Park) - "mm/damon: cleanup kdamond, damon_call(), damos filter and DAMON_MIN_REGION" does some cleanup work in the core DAMON code (SeongJae Park) - "Docs/mm/damon: update intro, modules, maintainer profile, and misc" performs maintenance work on the DAMON documentation (SeongJae Park) - "mm: add and use vma_assert_stabilised() helper" refactors and cleans up the core VMA code. The main aim here is to be able to use the mmap write lock's lockdep state to perform various assertions regarding the locking which the VMA code requires (Lorenzo Stoakes) - "mm, swap: swap table phase II: unify swapin use" removes some old swap code (swap cache bypassing and swap synchronization) which wasn't working very well. Various other cleanups and simplifications were made. The end result is a 20% speedup in one benchmark (Kairui Song) - "enable PT_RECLAIM on more 64-bit architectures" makes PT_RECLAIM available on 64-bit alpha, loongarch, mips, parisc, and um. Various cleanups were performed along the way (Qi Zheng) * tag 'mm-stable-2026-02-11-19-22' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (325 commits) mm/memory: handle non-split locks correctly in zap_empty_pte_table() mm: move pte table reclaim code to memory.c mm: make PT_RECLAIM depends on MMU_GATHER_RCU_TABLE_FREE mm: convert __HAVE_ARCH_TLB_REMOVE_TABLE to CONFIG_HAVE_ARCH_TLB_REMOVE_TABLE config um: mm: enable MMU_GATHER_RCU_TABLE_FREE parisc: mm: enable MMU_GATHER_RCU_TABLE_FREE mips: mm: enable MMU_GATHER_RCU_TABLE_FREE LoongArch: mm: enable MMU_GATHER_RCU_TABLE_FREE alpha: mm: enable MMU_GATHER_RCU_TABLE_FREE mm: change mm/pt_reclaim.c to use asm/tlb.h instead of asm-generic/tlb.h mm/damon/stat: remove __read_mostly from memory_idle_ms_percentiles zsmalloc: make common caches global mm: add SPDX id lines to some mm source files mm/zswap: use %pe to print error pointers mm/vmscan: use %pe to print error pointers mm/readahead: fix typo in comment mm: khugepaged: fix NR_FILE_PAGES and NR_SHMEM in collapse_file() mm: refactor vma_map_pages to use vm_insert_pages mm/damon: unify address range representation with damon_addr_range mm/cma: replace snprintf with strscpy in cma_new_area ...
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c451
1 files changed, 260 insertions, 191 deletions
diff --git a/mm/memory.c b/mm/memory.c
index b2909c94e249..b0d487229b2e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct page *page;
if (likely(softleaf_is_swap(entry))) {
- if (swap_duplicate(entry) < 0)
+ if (swap_dup_entry_direct(entry) < 0)
return -EIO;
/* make sure dst_mm is on swapoff's mmlist. */
@@ -1256,7 +1256,7 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
nr = 1;
@@ -1325,7 +1325,7 @@ again:
} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(orig_src_pte, src_ptl);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
@@ -1748,7 +1748,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
nr = swap_pte_batch(pte, max_nr, ptent);
rss[MM_SWAPENTS] -= nr;
- free_swap_and_cache_nr(entry, nr);
+ swap_put_entries_direct(entry, nr);
} else if (softleaf_is_migration(entry)) {
struct folio *folio = softleaf_to_folio(entry);
@@ -1821,11 +1821,70 @@ static inline int do_zap_pte_range(struct mmu_gather *tlb,
return nr;
}
+static bool pte_table_reclaim_possible(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ if (!IS_ENABLED(CONFIG_PT_RECLAIM))
+ return false;
+ /* Only zap if we are allowed to and cover the full page table. */
+ return details && details->reclaim_pt && (end - start >= PMD_SIZE);
+}
+
+static bool zap_empty_pte_table(struct mm_struct *mm, pmd_t *pmd,
+ spinlock_t *ptl, pmd_t *pmdval)
+{
+ spinlock_t *pml = pmd_lockptr(mm, pmd);
+
+ if (ptl != pml && !spin_trylock(pml))
+ return false;
+
+ *pmdval = pmdp_get(pmd);
+ pmd_clear(pmd);
+ if (ptl != pml)
+ spin_unlock(pml);
+ return true;
+}
+
+static bool zap_pte_table_if_empty(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, pmd_t *pmdval)
+{
+ spinlock_t *pml, *ptl = NULL;
+ pte_t *start_pte, *pte;
+ int i;
+
+ pml = pmd_lock(mm, pmd);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, pmdval, &ptl);
+ if (!start_pte)
+ goto out_ptl;
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(ptep_get(pte)))
+ goto out_ptl;
+ }
+ pte_unmap(start_pte);
+
+ pmd_clear(pmd);
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+ return true;
+out_ptl:
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl);
+ if (ptl != pml)
+ spin_unlock(pml);
+ return false;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
+ bool can_reclaim_pt = pte_table_reclaim_possible(addr, end, details);
bool force_flush = false, force_break = false;
struct mm_struct *mm = tlb->mm;
int rss[NR_MM_COUNTERS];
@@ -1834,7 +1893,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *pte;
pmd_t pmdval;
unsigned long start = addr;
- bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
bool direct_reclaim = true;
int nr;
@@ -1846,7 +1904,7 @@ retry:
return addr;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
bool any_skipped = false;
@@ -1875,10 +1933,10 @@ retry:
* from being repopulated by another thread.
*/
if (can_reclaim_pt && direct_reclaim && addr == end)
- direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+ direct_reclaim = zap_empty_pte_table(mm, pmd, ptl, &pmdval);
add_mm_rss_vec(mm, rss);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
/* Do the actual TLB flush before dropping ptl */
if (force_flush) {
@@ -1904,10 +1962,10 @@ retry:
}
if (can_reclaim_pt) {
- if (direct_reclaim)
- free_pte(mm, start, tlb, pmdval);
- else
- try_to_free_pte(mm, pmd, start, tlb);
+ if (direct_reclaim || zap_pte_table_if_empty(mm, pmd, start, &pmdval)) {
+ pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+ mm_dec_nr_ptes(mm);
+ }
}
return addr;
@@ -2499,7 +2557,6 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
{
unsigned long count = vma_pages(vma);
unsigned long uaddr = vma->vm_start;
- int ret, i;
/* Fail if the user requested offset is beyond the end of the object */
if (offset >= num)
@@ -2509,14 +2566,7 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
if (count > num - offset)
return -ENXIO;
- for (i = 0; i < count; i++) {
- ret = vm_insert_page(vma, uaddr, pages[offset + i]);
- if (ret < 0)
- return ret;
- uaddr += PAGE_SIZE;
- }
-
- return 0;
+ return vm_insert_pages(vma, uaddr, pages + offset, &count);
}
/**
@@ -2816,7 +2866,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
@@ -2826,7 +2876,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -3177,7 +3227,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
return -EINVAL;
}
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
if (fn) {
do {
@@ -3190,7 +3240,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
}
*mask |= PGTBL_PTE_MODIFIED;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
if (mm != &init_mm)
pte_unmap_unlock(mapped_pte, ptl);
@@ -4357,12 +4407,27 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
return 0;
}
-static inline bool should_try_to_free_swap(struct folio *folio,
+/*
+ * Check if we should call folio_free_swap to free the swap cache.
+ * folio_free_swap only frees the swap cache to release the slot if swap
+ * count is zero, so we don't need to check the swap count here.
+ */
+static inline bool should_try_to_free_swap(struct swap_info_struct *si,
+ struct folio *folio,
struct vm_area_struct *vma,
+ unsigned int extra_refs,
unsigned int fault_flags)
{
if (!folio_test_swapcache(folio))
return false;
+ /*
+ * Always try to free swap cache for SWP_SYNCHRONOUS_IO devices. Swap
+ * cache can help save some IO or memory overhead, but these devices
+ * are fast, and meanwhile, swap cache pinning the slot deferring the
+ * release of metadata or fragmentation is a more critical issue.
+ */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+ return true;
if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
folio_test_mlocked(folio))
return true;
@@ -4373,7 +4438,7 @@ static inline bool should_try_to_free_swap(struct folio *folio,
* reference only in case it's likely that we'll be the exclusive user.
*/
return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
- folio_ref_count(folio) == (1 + folio_nr_pages(folio));
+ folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
}
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
@@ -4611,7 +4676,16 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+/* Sanity check that a folio is fully exclusive */
+static void check_swap_exclusive(struct folio *folio, swp_entry_t entry,
+ unsigned int nr_pages)
+{
+ /* Called under PT locked and folio locked, the swap count is stable */
+ do {
+ VM_WARN_ON_ONCE_FOLIO(__swap_count(entry) != 1, folio);
+ entry.val++;
+ } while (--nr_pages);
+}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
@@ -4624,17 +4698,14 @@ static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct folio *swapcache, *folio = NULL;
- DECLARE_WAITQUEUE(wait, current);
+ struct folio *swapcache = NULL, *folio;
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
- bool need_clear_cache = false;
bool exclusive = false;
softleaf_t entry;
pte_t pte;
vm_fault_t ret = 0;
- void *shadow = NULL;
int nr_pages;
unsigned long page_idx;
unsigned long address;
@@ -4705,57 +4776,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio = swap_cache_get_folio(entry);
if (folio)
swap_update_readahead(folio, vma, vmf->address);
- swapcache = folio;
-
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
- __swap_count(entry) == 1) {
- /* skip swapcache */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
folio = alloc_swap_folio(vmf);
if (folio) {
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- nr_pages = folio_nr_pages(folio);
- if (folio_test_large(folio))
- entry.val = ALIGN_DOWN(entry.val, nr_pages);
/*
- * Prevent parallel swapin from proceeding with
- * the cache flag. Otherwise, another thread
- * may finish swapin first, free the entry, and
- * swapout reusing the same entry. It's
- * undetectable as pte_same() returns true due
- * to entry reuse.
+ * folio is charged, so swapin can only fail due
+ * to raced swapin and return NULL.
*/
- if (swapcache_prepare(entry, nr_pages)) {
- /*
- * Relax a bit to prevent rapid
- * repeated page faults.
- */
- add_wait_queue(&swapcache_wq, &wait);
- schedule_timeout_uninterruptible(1);
- remove_wait_queue(&swapcache_wq, &wait);
- goto out_page;
- }
- need_clear_cache = true;
-
- memcg1_swapin(entry, nr_pages);
-
- shadow = swap_cache_get_shadow(entry);
- if (shadow)
- workingset_refault(folio, shadow);
-
- folio_add_lru(folio);
-
- /* To provide entry to swap_read_folio() */
- folio->swap = entry;
- swap_read_folio(folio, NULL);
- folio->private = NULL;
+ swapcache = swapin_folio(entry, folio);
+ if (swapcache != folio)
+ folio_put(folio);
+ folio = swapcache;
}
} else {
- folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- vmf);
- swapcache = folio;
+ folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
}
if (!folio) {
@@ -4777,60 +4812,58 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
}
+ swapcache = folio;
ret |= folio_lock_or_retry(folio, vmf);
if (ret & VM_FAULT_RETRY)
goto out_release;
page = folio_file_page(folio, swp_offset(entry));
- if (swapcache) {
- /*
- * Make sure folio_free_swap() or swapoff did not release the
- * swapcache from under us. The page pin, and pte_same test
- * below, are not enough to exclude that. Even if it is still
- * swapcache, we need to check that the page's swap has not
- * changed.
- */
- if (unlikely(!folio_matches_swap_entry(folio, entry)))
- goto out_page;
-
- if (unlikely(PageHWPoison(page))) {
- /*
- * hwpoisoned dirty swapcache pages are kept for killing
- * owner processes (which may be unknown at hwpoison time)
- */
- ret = VM_FAULT_HWPOISON;
- goto out_page;
- }
-
- /*
- * KSM sometimes has to copy on read faults, for example, if
- * folio->index of non-ksm folios would be nonlinear inside the
- * anon VMA -- the ksm flag is lost on actual swapout.
- */
- folio = ksm_might_need_to_copy(folio, vma, vmf->address);
- if (unlikely(!folio)) {
- ret = VM_FAULT_OOM;
- folio = swapcache;
- goto out_page;
- } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
- ret = VM_FAULT_HWPOISON;
- folio = swapcache;
- goto out_page;
- }
- if (folio != swapcache)
- page = folio_page(folio, 0);
+ /*
+ * Make sure folio_free_swap() or swapoff did not release the
+ * swapcache from under us. The page pin, and pte_same test
+ * below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not
+ * changed.
+ */
+ if (unlikely(!folio_matches_swap_entry(folio, entry)))
+ goto out_page;
+ if (unlikely(PageHWPoison(page))) {
/*
- * If we want to map a page that's in the swapcache writable, we
- * have to detect via the refcount if we're really the exclusive
- * owner. Try removing the extra reference from the local LRU
- * caches if required.
+ * hwpoisoned dirty swapcache pages are kept for killing
+ * owner processes (which may be unknown at hwpoison time)
*/
- if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
- !folio_test_ksm(folio) && !folio_test_lru(folio))
- lru_add_drain();
+ ret = VM_FAULT_HWPOISON;
+ goto out_page;
}
+ /*
+ * KSM sometimes has to copy on read faults, for example, if
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
+ */
+ folio = ksm_might_need_to_copy(folio, vma, vmf->address);
+ if (unlikely(!folio)) {
+ ret = VM_FAULT_OOM;
+ folio = swapcache;
+ goto out_page;
+ } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
+ ret = VM_FAULT_HWPOISON;
+ folio = swapcache;
+ goto out_page;
+ } else if (folio != swapcache)
+ page = folio_page(folio, 0);
+
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * owner. Try removing the extra reference from the local LRU
+ * caches if required.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !folio_test_ksm(folio) && !folio_test_lru(folio))
+ lru_add_drain();
+
folio_throttle_swaprate(folio, GFP_KERNEL);
/*
@@ -4846,24 +4879,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_nomap;
}
- /* allocated large folios for SWP_SYNCHRONOUS_IO */
- if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
- unsigned long nr = folio_nr_pages(folio);
- unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
- unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
- pte_t *folio_ptep = vmf->pte - idx;
- pte_t folio_pte = ptep_get(folio_ptep);
-
- if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
- swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
- goto out_nomap;
-
- page_idx = idx;
- address = folio_start;
- ptep = folio_ptep;
- goto check_folio;
- }
-
nr_pages = 1;
page_idx = 0;
address = vmf->address;
@@ -4908,11 +4923,36 @@ check_folio:
BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
/*
+ * If a large folio already belongs to anon mapping, then we
+ * can just go on and map it partially.
+ * If not, with the large swapin check above failing, the page table
+ * have changed, so sub pages might got charged to the wrong cgroup,
+ * or even should be shmem. So we have to free it and fallback.
+ * Nothing should have touched it, both anon and shmem checks if a
+ * large folio is fully appliable before use.
+ *
+ * This will be removed once we unify folio allocation in the swap cache
+ * layer, where allocation of a folio stabilizes the swap entries.
+ */
+ if (!folio_test_anon(folio) && folio_test_large(folio) &&
+ nr_pages != folio_nr_pages(folio)) {
+ if (!WARN_ON_ONCE(folio_test_dirty(folio)))
+ swap_cache_del_folio(folio);
+ goto out_nomap;
+ }
+
+ /*
* Check under PT lock (to protect against concurrent fork() sharing
* the swap entry concurrently) for certainly exclusive pages.
*/
if (!folio_test_ksm(folio)) {
+ /*
+ * The can_swapin_thp check above ensures all PTE have
+ * same exclusiveness. Checking just one PTE is fine.
+ */
exclusive = pte_swp_exclusive(vmf->orig_pte);
+ if (exclusive)
+ check_swap_exclusive(folio, entry, nr_pages);
if (folio != swapcache) {
/*
* We have a fresh page that is not exposed to the
@@ -4946,19 +4986,10 @@ check_folio:
/*
* Some architectures may have to restore extra metadata to the page
* when reading from swap. This metadata may be indexed by swap entry
- * so this must be called before swap_free().
+ * so this must be called before folio_put_swap().
*/
arch_swap_restore(folio_swap(entry, folio), folio);
- /*
- * Remove the swap entry and conditionally try to free up the swapcache.
- * We're already holding a reference on the page but haven't mapped it
- * yet.
- */
- swap_free_nr(entry, nr_pages);
- if (should_try_to_free_swap(folio, vma, vmf->flags))
- folio_free_swap(folio);
-
add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
pte = mk_pte(page, vma->vm_page_prot);
@@ -4990,22 +5021,24 @@ check_folio:
vmf->orig_pte = pte_advance_pfn(pte, page_idx);
/* ksm created a completely new copy */
- if (unlikely(folio != swapcache && swapcache)) {
+ if (unlikely(folio != swapcache)) {
folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
+ folio_put_swap(swapcache, NULL);
} else if (!folio_test_anon(folio)) {
/*
- * We currently only expect small !anon folios which are either
- * fully exclusive or fully shared, or new allocated large
- * folios which are fully exclusive. If we ever get large
- * folios within swapcache here, we have to be careful.
+ * We currently only expect !anon folios that are fully
+ * mappable. See the comment after can_swapin_thp above.
*/
- VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
- VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
+ folio_put_swap(folio, NULL);
} else {
+ VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio));
folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
- rmap_flags);
+ rmap_flags);
+ folio_put_swap(folio, nr_pages == 1 ? page : NULL);
}
VM_BUG_ON(!folio_test_anon(folio) ||
@@ -5014,13 +5047,21 @@ check_folio:
arch_do_swap_page_nr(vma->vm_mm, vma, address,
pte, pte, nr_pages);
+ /*
+ * Remove the swap entry and conditionally try to free up the swapcache.
+ * Do it after mapping, so raced page faults will likely see the folio
+ * in swap cache and wait on the folio lock.
+ */
+ if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags))
+ folio_free_swap(folio);
+
folio_unlock(folio);
- if (folio != swapcache && swapcache) {
+ if (unlikely(folio != swapcache)) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
* (to avoid false positives from pte_same). For
- * further safety release the lock after the swap_free
+ * further safety release the lock after the folio_put_swap
* so that the swap count won't change under a
* parallel locked swapcache.
*/
@@ -5041,12 +5082,6 @@ unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- /* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache) {
- swapcache_clear(si, entry, nr_pages);
- if (waitqueue_active(&swapcache_wq))
- wake_up(&swapcache_wq);
- }
if (si)
put_swap_device(si);
return ret;
@@ -5054,18 +5089,15 @@ out_nomap:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
folio_unlock(folio);
out_release:
folio_put(folio);
- if (folio != swapcache && swapcache) {
+ if (folio != swapcache) {
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache) {
- swapcache_clear(si, entry, nr_pages);
- if (waitqueue_active(&swapcache_wq))
- wake_up(&swapcache_wq);
- }
if (si)
put_swap_device(si);
return ret;
@@ -5935,7 +5967,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
else
*last_cpupid = folio_last_cpupid(folio);
- /* Record the current PID acceesing VMA */
+ /* Record the current PID accessing VMA */
vma_set_access_pid_bit(vma);
count_vm_numa_event(NUMA_HINT_FAULTS);
@@ -6254,7 +6286,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* Use the maywrite version to indicate that vmf->pte may be
* modified, but since we will use pte_same() to detect the
* change of the !pte_none() entry, there is no need to recheck
- * the pmdval. Here we chooes to pass a dummy variable instead
+ * the pmdval. Here we choose to pass a dummy variable instead
* of NULL, which helps new user think about why this place is
* special.
*/
@@ -7240,40 +7272,77 @@ static inline int process_huge_page(
return 0;
}
-static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
- unsigned int nr_pages)
+static void clear_contig_highpages(struct page *page, unsigned long addr,
+ unsigned int nr_pages)
{
- unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
- int i;
+ unsigned int i, count;
+ /*
+ * When clearing we want to operate on the largest extent possible to
+ * allow for architecture specific extent based optimizations.
+ *
+ * However, since clear_user_highpages() (and primitives clear_user_pages(),
+ * clear_pages()), do not call cond_resched(), limit the unit size when
+ * running under non-preemptible scheduling models.
+ */
+ const unsigned int unit = preempt_model_preemptible() ?
+ nr_pages : PROCESS_PAGES_NON_PREEMPT_BATCH;
might_sleep();
- for (i = 0; i < nr_pages; i++) {
+
+ for (i = 0; i < nr_pages; i += count) {
cond_resched();
- clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE);
+
+ count = min(unit, nr_pages - i);
+ clear_user_highpages(page + i, addr + i * PAGE_SIZE, count);
}
}
-static int clear_subpage(unsigned long addr, int idx, void *arg)
-{
- struct folio *folio = arg;
-
- clear_user_highpage(folio_page(folio, idx), addr);
- return 0;
-}
+/*
+ * When zeroing a folio, we want to differentiate between pages in the
+ * vicinity of the faulting address where we have spatial and temporal
+ * locality, and those far away where we don't.
+ *
+ * Use a radius of 2 for determining the local neighbourhood.
+ */
+#define FOLIO_ZERO_LOCALITY_RADIUS 2
/**
* folio_zero_user - Zero a folio which will be mapped to userspace.
* @folio: The folio to zero.
- * @addr_hint: The address will be accessed or the base address if uncelar.
+ * @addr_hint: The address accessed by the user or the base address.
*/
void folio_zero_user(struct folio *folio, unsigned long addr_hint)
{
- unsigned int nr_pages = folio_nr_pages(folio);
+ const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio));
+ const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE;
+ const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1);
+ const int radius = FOLIO_ZERO_LOCALITY_RADIUS;
+ struct range r[3];
+ int i;
- if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
- clear_gigantic_page(folio, addr_hint, nr_pages);
- else
- process_huge_page(addr_hint, nr_pages, clear_subpage, folio);
+ /*
+ * Faulting page and its immediate neighbourhood. Will be cleared at the
+ * end to keep its cachelines hot.
+ */
+ r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - radius, pg.start, pg.end),
+ clamp_t(s64, fault_idx + radius, pg.start, pg.end));
+
+ /* Region to the left of the fault */
+ r[1] = DEFINE_RANGE(pg.start,
+ clamp_t(s64, r[2].start - 1, pg.start - 1, r[2].start));
+
+ /* Region to the right of the fault: always valid for the common fault_idx=0 case. */
+ r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end + 1, r[2].end, pg.end + 1),
+ pg.end);
+
+ for (i = 0; i < ARRAY_SIZE(r); i++) {
+ const unsigned long addr = base_addr + r[i].start * PAGE_SIZE;
+ const unsigned int nr_pages = range_len(&r[i]);
+ struct page *page = folio_page(folio, r[i].start);
+
+ if (nr_pages > 0)
+ clear_contig_highpages(page, addr, nr_pages);
+ }
}
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,