summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- include/linux/hugetlb.h 8
-rw-r--r-- include/linux/mm.h 8
-rw-r--r-- mm/cma.c 7
-rw-r--r-- mm/cma_debug.c 3
-rw-r--r-- mm/damon/ops-common.c 4
-rw-r--r-- mm/huge_memory.c 4
-rw-r--r-- mm/hugetlb.c 69
-rw-r--r-- mm/hugetlb_vmemmap.c 36
-rw-r--r-- mm/memcontrol.c 5
-rw-r--r-- mm/memory-failure.c 19
-rw-r--r-- mm/userfaultfd.c 92
11 files changed, 162 insertions, 93 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5957bc25efa8..2abaf99321e9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared);
void folio_putback_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -421,12 +419,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
return 0;
}
-static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
-{
- return 0;
-}
-
static inline void folio_putback_hugetlb(struct folio *folio)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 06bbe9eba636..fc2acedf0b76 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4975,8 +4975,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
*/
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
-extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
@@ -4984,12 +4982,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
{
}
-static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
-{
- return 0;
-}
-
static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}
diff --git a/mm/cma.c b/mm/cma.c
index c7ca567f4c5c..a13ce4999b39 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -188,10 +188,13 @@ cleanup:
/* Expose all pages to the buddy, they are useless for CMA. */
if (!test_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags)) {
- for (r = 0; r < allocrange; r++) {
+ for (r = 0; r < cma->nranges; r++) {
+ unsigned long start_pfn;
+
cmr = &cma->ranges[r];
+ start_pfn = r <= allocrange ? early_pfn[r] : cmr->early_pfn;
end_pfn = cmr->base_pfn + cmr->count;
- for (pfn = early_pfn[r]; pfn < end_pfn; pfn++)
+ for (pfn = start_pfn; pfn < end_pfn; pfn++)
free_reserved_page(pfn_to_page(pfn));
}
}
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 5ae38f5abbcc..523ba4a0f9f7 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -205,7 +205,8 @@ static int __init cma_debugfs_init(void)
cma_debugfs_root = debugfs_create_dir("cma", NULL);
for (i = 0; i < cma_area_count; i++)
- cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root);
+ if (test_bit(CMA_ACTIVATED, &cma_areas[i].flags))
+ cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root);
return 0;
}
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 8c6d613425c1..c3e4c871b0bb 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -32,9 +32,9 @@ struct folio *damon_get_folio(unsigned long pfn)
return NULL;
folio = page_folio(page);
- if (!folio_test_lru(folio) || !folio_try_get(folio))
+ if (!folio_try_get(folio))
return NULL;
- if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ if (unlikely(page_folio(page) != folio) || !folio_test_lru(folio)) {
folio_put(folio);
folio = NULL;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 970e077019b7..653f2dc03403 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3015,9 +3015,9 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
if (!folio_test_referenced(folio) && pud_young(old_pud))
folio_set_referenced(folio);
folio_remove_rmap_pud(folio, page, vma);
- folio_put(folio);
add_mm_counter(vma->vm_mm, mm_counter_file(folio),
-HPAGE_PUD_NR);
+ folio_put(folio);
}
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
@@ -3133,7 +3133,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (!folio_test_referenced(folio) && pmd_young(old_pmd))
folio_set_referenced(folio);
folio_remove_rmap_pmd(folio, page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
folio_put(folio);
+ return;
}
add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
return;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4b80b167cc9c..c921287489de 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -118,6 +118,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
+static int __huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
+ bool check_locks);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
@@ -4974,6 +4977,7 @@ again:
addr, dst_vma);
folio_put(pte_folio);
if (ret) {
+ restore_reserve_on_error(h, dst_vma, addr, new_folio);
folio_put(new_folio);
break;
}
@@ -6270,6 +6274,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
folio_put(*foliop);
*foliop = NULL;
if (ret) {
+ restore_reserve_on_error(h, dst_vma, dst_addr, folio);
folio_put(folio);
goto out;
}
@@ -6891,6 +6896,31 @@ out:
return pte;
}
+static int __huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
+ bool check_locks)
+{
+ unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd = pgd_offset(mm, addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
+
+ if (sz != PMD_SIZE)
+ return 0;
+ if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep)))
+ return 0;
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ if (check_locks)
+ hugetlb_vma_assert_locked(vma);
+ pud_clear(pud);
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
+ mm_dec_nr_pmds(mm);
+ return 1;
+}
+
/**
* huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
* @tlb: the current mmu_gather.
@@ -6910,24 +6940,7 @@ out:
int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
- unsigned long sz = huge_page_size(hstate_vma(vma));
- struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd = pgd_offset(mm, addr);
- p4d_t *p4d = p4d_offset(pgd, addr);
- pud_t *pud = pud_offset(p4d, addr);
-
- if (sz != PMD_SIZE)
- return 0;
- if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep)))
- return 0;
- i_mmap_assert_write_locked(vma->vm_file->f_mapping);
- hugetlb_vma_assert_locked(vma);
- pud_clear(pud);
-
- tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
-
- mm_dec_nr_pmds(mm);
- return 1;
+ return __huge_pmd_unshare(tlb, vma, addr, ptep, /*check_locks=*/true);
}
/*
@@ -6961,6 +6974,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
}
+static int __huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
+ bool check_locks)
+{
+ return 0;
+}
+
int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -7141,17 +7161,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
return ret;
}
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
-{
- int ret;
-
- spin_lock_irq(&hugetlb_lock);
- ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
- spin_unlock_irq(&hugetlb_lock);
- return ret;
-}
-
/**
* folio_putback_hugetlb - unisolate a hugetlb folio
* @folio: the isolated hugetlb folio
@@ -7269,7 +7278,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(&tlb, vma, address, ptep);
+ __huge_pmd_unshare(&tlb, vma, address, ptep, take_locks);
spin_unlock(ptl);
}
huge_pmd_unshare_flush(&tlb, vma);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4a077d231d3a..133b46dfb09f 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -207,6 +207,8 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
/* Remapping the head page requires r/w */
if (unlikely(walk->nr_walked == 0 && walk->vmemmap_head)) {
+ VM_WARN_ON_ONCE(!PageHead((const struct page *)addr));
+
list_del(&walk->vmemmap_head->lru);
/*
@@ -218,6 +220,8 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL);
} else {
+ VM_WARN_ON_ONCE(!PageTail((const struct page *)addr));
+
/*
* Remap the tail pages as read-only to catch illegal write
* operation to the tail pages.
@@ -232,33 +236,28 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
struct vmemmap_remap_walk *walk)
{
- struct page *page;
- struct page *from, *to;
-
- page = list_first_entry(walk->vmemmap_pages, struct page, lru);
- list_del(&page->lru);
+ struct page *src = pte_page(ptep_get(pte)), *dst;
/*
- * Initialize tail pages in the newly allocated vmemmap page.
- *
- * There is folio-scope metadata that is encoded in the first few
- * tail pages.
- *
- * Use the value last tail page in the page with the head page
- * to initialize the rest of tail pages.
+ * When rolling back vmemmap_remap_free(), keep the copied head page
+ * mapping and restore only PTEs currently pointing at the shared tail
+ * page.
*/
- from = compound_head((struct page *)addr) +
- PAGE_SIZE / sizeof(struct page) - 1;
- to = page_to_virt(page);
- for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++, to++)
- *to = *from;
+ if (walk->vmemmap_tail && walk->vmemmap_tail != src)
+ return;
+
+ VM_WARN_ON_ONCE(PageHead((const struct page *)addr));
+
+ dst = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&dst->lru);
+ copy_page(page_to_virt(dst), page_to_virt(src));
/*
* Makes sure that preceding stores to the page contents become visible
* before the set_pte_at() write.
*/
smp_wmb();
- set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+ set_pte_at(&init_mm, addr, pte, mk_pte(dst, PAGE_KERNEL));
}
/**
@@ -324,6 +323,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
*/
walk = (struct vmemmap_remap_walk) {
.remap_pte = vmemmap_restore_pte,
+ .vmemmap_tail = vmemmap_tail,
.vmemmap_pages = vmemmap_pages,
.flags = 0,
};
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 177732fef010..1a4fd2504bcd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2011,6 +2011,7 @@ struct memcg_stock_pcp {
struct work_struct work;
unsigned long flags;
+ uint8_t drain_idx;
};
static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
@@ -2194,7 +2195,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (!success) {
i = empty_slot;
if (i == -1) {
- i = get_random_u32_below(NR_MEMCG_STOCK);
+ i = stock->drain_idx++;
+ if (stock->drain_idx == NR_MEMCG_STOCK)
+ stock->drain_idx = 0;
drain_stock(stock, i);
}
css_get(&memcg->css);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ee42d4361309..d47aef256a32 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1966,20 +1966,19 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
folio_free_raw_hwp(folio, true);
}
-/*
- * Called from hugetlb code with hugetlb_lock held.
- */
-int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared)
{
struct page *page = pfn_to_page(pfn);
- struct folio *folio = page_folio(page);
+ struct folio *folio;
bool count_increased = false;
int ret, rc;
+ spin_lock_irq(&hugetlb_lock);
+ folio = page_folio(page);
if (!folio_test_hugetlb(folio)) {
ret = MF_HUGETLB_NON_HUGEPAGE;
- goto out;
+ goto out_unlock;
} else if (flags & MF_COUNT_INCREASED) {
ret = MF_HUGETLB_IN_USED;
count_increased = true;
@@ -1995,13 +1994,13 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
} else {
ret = MF_HUGETLB_RETRY;
if (!(flags & MF_NO_RETRY))
- goto out;
+ goto out_unlock;
}
rc = hugetlb_update_hwpoison(folio, page);
if (rc >= MF_HUGETLB_FOLIO_PRE_POISONED) {
ret = rc;
- goto out;
+ goto out_unlock;
}
/*
@@ -2013,8 +2012,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
*migratable_cleared = true;
}
+ spin_unlock_irq(&hugetlb_lock);
return ret;
-out:
+out_unlock:
+ spin_unlock_irq(&hugetlb_lock);
if (count_increased)
folio_put(folio);
return ret;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 180bad42fc79..80cc8be5725f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -14,6 +14,8 @@
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
+#include <linux/file.h>
+#include <linux/cleanup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -66,7 +68,7 @@ static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma)
{
if (vma_is_anonymous(vma))
return &anon_uffd_ops;
- return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL;
+ return vma->vm_ops->uffd_ops;
}
static __always_inline
@@ -443,16 +445,80 @@ static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr)
return ret;
}
-static int mfill_copy_folio_retry(struct mfill_state *state,
+#define MFILL_RETRY_STATE_VMA_FLAGS \
+ append_vma_flags(__VMA_UFFD_FLAGS, VMA_SHARED_BIT)
+
+/*
+ * VMA state saved before dropping the locks in mfill_copy_folio_retry().
+ * Used to detect VMA replacement or incompatible changes after reacquiring the
+ * locks.
+ */
+struct mfill_retry_state {
+ const struct vm_uffd_ops *ops;
+ struct file *file;
+ vma_flags_t flags;
+ pgoff_t pgoff;
+};
+
+static void mfill_retry_state_save(struct mfill_retry_state *s,
+ struct vm_area_struct *vma)
+{
+ s->flags = vma_flags_and_mask(&vma->flags, MFILL_RETRY_STATE_VMA_FLAGS);
+ s->ops = vma_uffd_ops(vma);
+ s->pgoff = vma->vm_pgoff;
+
+ if (vma->vm_file)
+ s->file = get_file(vma->vm_file);
+}
+
+static bool mfill_retry_state_changed(struct mfill_retry_state *state,
+ struct vm_area_struct *vma)
+{
+ vma_flags_t flags = vma_flags_and_mask(&vma->flags,
+ MFILL_RETRY_STATE_VMA_FLAGS);
+
+ /* Have any UFFD flags (missing, WP, minor) changed? */
+ if (!vma_flags_same_pair(&state->flags, &flags))
+ return true;
+
+ /* VMA type or effective uffd_ops changed while the lock was dropped */
+ if (state->ops != vma_uffd_ops(vma))
+ return true;
+
+ /* VMA was anonymous before; changed only if it no longer is */
+ if (!state->file)
+ return !vma_is_anonymous(vma);
+
+ /* VMA was file backed, but file, inode or offset has changed */
+ if (!vma->vm_file || vma->vm_file->f_inode != state->file->f_inode ||
+ state->file != vma->vm_file || vma->vm_pgoff != state->pgoff)
+ return true;
+
+ return false;
+}
+
+static void mfill_retry_state_put(struct mfill_retry_state *s)
+{
+ if (s->file)
+ fput(s->file);
+}
+
+DEFINE_FREE(retry_put, struct mfill_retry_state *,
+ if (_T) mfill_retry_state_put(_T));
+
+static int mfill_copy_folio_retry(struct mfill_state *mfill_state,
struct folio *folio)
{
- const struct vm_uffd_ops *orig_ops = vma_uffd_ops(state->vma);
- unsigned long src_addr = state->src_addr;
+ struct mfill_retry_state retry_state = { 0 };
+ struct mfill_retry_state *for_free __free(retry_put) = &retry_state;
+ unsigned long src_addr = mfill_state->src_addr;
void *kaddr;
int err;
+ mfill_retry_state_save(&retry_state, mfill_state->vma);
+
/* retry copying with mm_lock dropped */
- mfill_put_vma(state);
+ mfill_put_vma(mfill_state);
kaddr = kmap_local_folio(folio, 0);
err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE);
@@ -463,19 +529,14 @@ static int mfill_copy_folio_retry(struct mfill_state *state,
flush_dcache_folio(folio);
/* reget VMA and PMD, they could change underneath us */
- err = mfill_get_vma(state);
+ err = mfill_get_vma(mfill_state);
if (err)
return err;
- /*
- * The VMA type may have changed while the lock was dropped
- * (e.g. replaced with a hugetlb mapping), making the caller's
- * ops pointer stale.
- */
- if (vma_uffd_ops(state->vma) != orig_ops)
+ if (mfill_retry_state_changed(&retry_state, mfill_state->vma))
return -EAGAIN;
- err = mfill_establish_pmd(state);
+ err = mfill_establish_pmd(mfill_state);
if (err)
return err;
@@ -491,6 +552,11 @@ static int __mfill_atomic_pte(struct mfill_state *state,
struct folio *folio;
int ret;
+ if (!ops) {
+ VM_WARN_ONCE(1, "UFFDIO_COPY for unsupported VMA");
+ return -EOPNOTSUPP;
+ }
+
folio = ops->alloc_folio(state->vma, state->dst_addr);
if (!folio)
return -ENOMEM;