summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/memory.c15
-rw-r--r--mm/mmap.c84
-rw-r--r--mm/numa_memblks.c2
-rw-r--r--mm/page_alloc.c10
-rw-r--r--mm/pagewalk.c16
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab_common.c2
-rw-r--r--mm/vma.c23
-rw-r--r--mm/vma.h26
10 files changed, 126 insertions, 55 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 4c9f5ea13271..33fa51d608dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1085,7 +1085,6 @@ config HMM_MIRROR
depends on MMU
config GET_FREE_REGION
- depends on SPARSEMEM
bool
config DEVICE_PRIVATE
diff --git a/mm/memory.c b/mm/memory.c
index 3ccee51adfbb..bdf77a3ec47b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
+ DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
+ add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
+ remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@@ -4604,8 +4609,11 @@ unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
@@ -4620,8 +4628,11 @@ out_release:
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 9c0fb43064b5..1e0e34cb993f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1418,6 +1418,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vmg.flags = vm_flags;
}
+ /*
+ * clear PTEs while the vma is still in the tree so that rmap
+ * cannot race with the freeing later in the truncate scenario.
+ * This is also needed for call_mmap(), which is why vm_ops
+ * close function is called.
+ */
+ vms_clean_up_area(&vms, &mas_detach);
vma = vma_merge_new_range(&vmg);
if (vma)
goto expanded;
@@ -1439,11 +1446,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (file) {
vma->vm_file = get_file(file);
- /*
- * call_mmap() may map PTE, so ensure there are no existing PTEs
- * and call the vm_ops close function if one exists.
- */
- vms_clean_up_area(&vms, &mas_detach);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
@@ -1640,6 +1642,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long populate = 0;
unsigned long ret = -EINVAL;
struct file *file;
+ vm_flags_t vm_flags;
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
current->comm, current->pid);
@@ -1656,12 +1659,60 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
- if (mmap_write_lock_killable(mm))
+ if (mmap_read_lock_killable(mm))
+ return -EINTR;
+
+ /*
+ * Look up VMA under read lock first so we can perform the security
+ * without holding locks (which can be problematic). We reacquire a
+ * write lock later and check nothing changed underneath us.
+ */
+ vma = vma_lookup(mm, start);
+
+ if (!vma || !(vma->vm_flags & VM_SHARED)) {
+ mmap_read_unlock(mm);
+ return -EINVAL;
+ }
+
+ prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+ prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+ prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+ flags &= MAP_NONBLOCK;
+ flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+
+ /* Save vm_flags used to calculate prot and flags, and recheck later. */
+ vm_flags = vma->vm_flags;
+ file = get_file(vma->vm_file);
+
+ mmap_read_unlock(mm);
+
+ /* Call outside mmap_lock to be consistent with other callers. */
+ ret = security_mmap_file(file, prot, flags);
+ if (ret) {
+ fput(file);
+ return ret;
+ }
+
+ ret = -EINVAL;
+
+ /* OK security check passed, take write lock + let it rip. */
+ if (mmap_write_lock_killable(mm)) {
+ fput(file);
return -EINTR;
+ }
vma = vma_lookup(mm, start);
- if (!vma || !(vma->vm_flags & VM_SHARED))
+ if (!vma)
+ goto out;
+
+ /* Make sure things didn't change under us. */
+ if (vma->vm_flags != vm_flags)
+ goto out;
+ if (vma->vm_file != file)
goto out;
if (start + size > vma->vm_end) {
@@ -1689,25 +1740,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
}
- prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
- prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
- prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
-
- flags &= MAP_NONBLOCK;
- flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
-
- file = get_file(vma->vm_file);
- ret = security_mmap_file(vma->vm_file, prot, flags);
- if (ret)
- goto out_fput;
ret = do_mmap(vma->vm_file, start, size,
prot, flags, 0, pgoff, &populate, NULL);
-out_fput:
- fput(file);
out:
mmap_write_unlock(mm);
+ fput(file);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
@@ -1754,7 +1791,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
vmg.prev = vma;
- vma_iter_next_range(vmi);
+ /* vmi is positioned at prev, which this mode expects. */
+ vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
if (vma_merge_new_range(&vmg))
goto out;
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index be52b93a9c58..a3877e9bc878 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -349,7 +349,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
for_each_reserved_mem_region(mb_region) {
int nid = memblock_get_region_node(mb_region);
- if (nid != MAX_NUMNODES)
+ if (numa_valid_node(nid))
node_set(nid, reserved_nodemask);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8afab64814dc..94a2ffe28008 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2893,12 +2893,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
page = __rmqueue(zone, order, migratetype, alloc_flags);
/*
- * If the allocation fails, allow OOM handling access
- * to HIGHATOMIC reserves as failing now is worse than
- * failing a high-order atomic allocation in the
- * future.
+ * If the allocation fails, allow OOM handling and
+ * order-0 (atomic) allocs access to HIGHATOMIC
+ * reserves as failing now is worse than failing a
+ * high-order atomic allocation in the future.
*/
- if (!page && (alloc_flags & ALLOC_OOM))
+ if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 461ea3bbd8d9..5f9f01532e67 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -744,7 +744,8 @@ struct folio *folio_walk_start(struct folio_walk *fw,
pud = pudp_get(pudp);
if (pud_none(pud))
goto not_found;
- if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
+ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
+ (!pud_present(pud) || pud_leaf(pud))) {
ptl = pud_lock(vma->vm_mm, pudp);
pud = pudp_get(pudp);
@@ -753,6 +754,10 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->pudp = pudp;
fw->pud = pud;
+ /*
+ * TODO: FW_MIGRATION support for PUD migration entries
+ * once there are relevant users.
+ */
if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
spin_unlock(ptl);
goto not_found;
@@ -769,12 +774,13 @@ struct folio *folio_walk_start(struct folio_walk *fw,
}
pmd_table:
- VM_WARN_ON_ONCE(pud_leaf(*pudp));
+ VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
pmdp = pmd_offset(pudp, addr);
pmd = pmdp_get_lockless(pmdp);
if (pmd_none(pmd))
goto not_found;
- if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
+ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
+ (!pmd_present(pmd) || pmd_leaf(pmd))) {
ptl = pmd_lock(vma->vm_mm, pmdp);
pmd = pmdp_get(pmdp);
@@ -786,7 +792,7 @@ pmd_table:
if (pmd_none(pmd)) {
spin_unlock(ptl);
goto not_found;
- } else if (!pmd_leaf(pmd)) {
+ } else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
spin_unlock(ptl);
goto pte_table;
} else if (pmd_present(pmd)) {
@@ -812,7 +818,7 @@ pmd_table:
}
pte_table:
- VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
+ VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
if (!ptep)
goto not_found;
diff --git a/mm/shmem.c b/mm/shmem.c
index c5adb987b23c..4ba1d00fabda 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
+ inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat);
+ inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3d26c257ed8b..552b92dfdac7 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1209,7 +1209,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
- memset((void *)p + new_size, 0, ks - new_size);
+ memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
kasan_enable_current();
}
diff --git a/mm/vma.c b/mm/vma.c
index 4737afcb064c..b21ffec33f8e 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -917,6 +917,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
pgoff_t pgoff = vmg->pgoff;
pgoff_t pglen = PHYS_PFN(end - start);
bool can_merge_left, can_merge_right;
+ bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
mmap_assert_write_locked(vmg->mm);
VM_WARN_ON(vmg->vma);
@@ -930,7 +931,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
return NULL;
can_merge_left = can_vma_merge_left(vmg);
- can_merge_right = can_vma_merge_right(vmg, can_merge_left);
+ can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
@@ -953,7 +954,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
if (can_merge_right && !can_merge_remove_vma(next))
vmg->end = end;
- vma_prev(vmg->vmi); /* Equivalent to going to the previous range */
+ /* In expand-only case we are already positioned at prev. */
+ if (!just_expand) {
+ /* Equivalent to going to the previous range. */
+ vma_prev(vmg->vmi);
+ }
}
/*
@@ -967,12 +972,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
}
/* If expansion failed, reset state. Allows us to retry merge later. */
- vmg->vma = NULL;
- vmg->start = start;
- vmg->end = end;
- vmg->pgoff = pgoff;
- if (vmg->vma == prev)
- vma_iter_set(vmg->vmi, start);
+ if (!just_expand) {
+ vmg->vma = NULL;
+ vmg->start = start;
+ vmg->end = end;
+ vmg->pgoff = pgoff;
+ if (vmg->vma == prev)
+ vma_iter_set(vmg->vmi, start);
+ }
return NULL;
}
diff --git a/mm/vma.h b/mm/vma.h
index 819f994cf727..55457cb68200 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -59,6 +59,17 @@ enum vma_merge_state {
VMA_MERGE_SUCCESS,
};
+enum vma_merge_flags {
+ VMG_FLAG_DEFAULT = 0,
+ /*
+ * If we can expand, simply do so. We know there is nothing to merge to
+ * the right. Does not reset state upon failure to merge. The VMA
+ * iterator is assumed to be positioned at the previous VMA, rather than
+ * at the gap.
+ */
+ VMG_FLAG_JUST_EXPAND = 1 << 0,
+};
+
/* Represents a VMA merge operation. */
struct vma_merge_struct {
struct mm_struct *mm;
@@ -75,6 +86,7 @@ struct vma_merge_struct {
struct mempolicy *policy;
struct vm_userfaultfd_ctx uffd_ctx;
struct anon_vma_name *anon_name;
+ enum vma_merge_flags merge_flags;
enum vma_merge_state state;
};
@@ -99,6 +111,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.flags = flags_, \
.pgoff = pgoff_, \
.state = VMA_MERGE_START, \
+ .merge_flags = VMG_FLAG_DEFAULT, \
}
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
@@ -118,6 +131,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.uffd_ctx = vma_->vm_userfaultfd_ctx, \
.anon_name = anon_vma_name(vma_), \
.state = VMA_MERGE_START, \
+ .merge_flags = VMG_FLAG_DEFAULT, \
}
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -241,15 +255,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
* failure method of leaving a gap where the MAP_FIXED mapping failed.
*/
mas_set_range(mas, vms->start, vms->end - 1);
- if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) {
- pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
- current->comm, current->pid);
- /* Leaving vmas detached and in-tree may hamper recovery */
- reattach_vmas(mas_detach);
- } else {
- /* Clean up the insertion of the unfortunate gap */
- vms_complete_munmap_vmas(vms, mas_detach);
- }
+ mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
+ /* Clean up the insertion of the unfortunate gap */
+ vms_complete_munmap_vmas(vms, mas_detach);
}
int