Diffstat (limited to 'mm')
-rw-r--r--mm/damon/core.c7
-rw-r--r--mm/damon/paddr.c3
-rw-r--r--mm/filemap.c126
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/hugetlb.c8
-rw-r--r--mm/memcontrol.c13
-rw-r--r--mm/memory.c19
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/page_alloc.c14
-rw-r--r--mm/readahead.c14
-rw-r--r--mm/swap_cgroup.c7
-rw-r--r--mm/util.c3
-rw-r--r--mm/vma.c3
14 files changed, 80 insertions, 156 deletions
diff --git a/mm/damon/core.c b/mm/damon/core.c
index c7b981308862..384935ef4e65 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -373,6 +373,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
* or damon_attrs are updated.
*/
scheme->next_apply_sis = 0;
+ scheme->walk_completed = false;
INIT_LIST_HEAD(&scheme->filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -1429,9 +1430,13 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
{
struct damos_filter *filter;
+ s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (damos_filter_match(ctx, t, r, filter))
+ if (damos_filter_match(ctx, t, r, filter)) {
+ if (filter->allow)
+ s->core_filters_allowed = true;
return !filter->allow;
+ }
}
return false;
}
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 0f9ae14f884d..c834aa217835 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -236,6 +236,9 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
{
struct damos_filter *filter;
+ if (scheme->core_filters_allowed)
+ return false;
+
damos_for_each_filter(filter, scheme) {
if (damos_pa_filter_match(filter, folio))
return !filter->allow;
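Note (not part of the patch): the two DAMON hunks work as a pair. core.c records in the new core_filters_allowed flag whether the matching core-layer filter was an allow-filter, and paddr.c then skips its own physical-address filters for regions the core layer already admitted, so the ops layer does not override that decision. A minimal userspace model of the short-circuit follows; the filter/scheme structs and the matches field are stand-ins for the kernel's damos_filter/damos types, not the real API.

#include <stdbool.h>
#include <stddef.h>

/* Stand-ins for struct damos_filter / struct damos (illustration only). */
struct filter { bool matches; bool allow; struct filter *next; };
struct scheme { struct filter *filters; bool core_filters_allowed; };

/* Models damos_filter_out(): the first matching filter decides, and we
 * remember whether that decision came from an allow-filter. */
static bool core_filter_out(struct scheme *s)
{
	s->core_filters_allowed = false;
	for (struct filter *f = s->filters; f; f = f->next) {
		if (f->matches) {
			if (f->allow)
				s->core_filters_allowed = true;
			return !f->allow;
		}
	}
	return false;	/* nothing matched: not filtered out */
}

/* Models damos_pa_filter_out(): when the core layer already allowed the
 * region, the paddr-level filters are skipped entirely. */
static bool pa_filter_out(struct scheme *s, struct filter *pa_filters)
{
	if (s->core_filters_allowed)
		return false;
	for (struct filter *f = pa_filters; f; f = f->next)
		if (f->matches)
			return !f->allow;
	return false;
}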
diff --git a/mm/filemap.c b/mm/filemap.c
index 2974691fdfad..e9404290f2c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,7 +47,6 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -1986,8 +1985,19 @@ no_page:
if (err == -EEXIST)
goto repeat;
- if (err)
+ if (err) {
+ /*
+ * When NOWAIT I/O fails to allocate folios, this could
+ * be due to a nonblocking memory allocation and not
+ * because the system actually is out of memory.
+ * Return -EAGAIN so that the caller retries in a
+ * blocking fashion instead of propagating -ENOMEM
+ * to the application.
+ */
+ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
+ err = -EAGAIN;
return ERR_PTR(err);
+ }
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
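Note (not part of the patch): the FGP_NOWAIT hunk only rewrites the error before it escapes: an allocation failure during a nonblocking lookup becomes -EAGAIN so the caller retries in blocking context instead of surfacing -ENOMEM to the application. A tiny, illustration-only helper expressing the same mapping (the function name is made up):

#include <errno.h>
#include <stdbool.h>

/* Illustration of the added FGP_NOWAIT handling: a nonblocking folio
 * allocation failure is not a real OOM, so report it as "try again". */
static int nowait_lookup_errno(bool fgp_nowait, int err)
{
	if (fgp_nowait && err == -ENOMEM)
		return -EAGAIN;
	return err;
}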
@@ -3198,14 +3208,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
- /*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't populate our mapping with 0 filled pages that we
- * never emitted an event for.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@@ -3274,10 +3276,6 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
unsigned int mmap_miss;
- /* See comment in do_sync_mmap_readahead. */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
@@ -3337,48 +3335,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
}
/**
- * filemap_fsnotify_fault - maybe emit a pre-content event.
- * @vmf: struct vm_fault containing details of the fault.
- *
- * If we have a pre-content watch on this file we will emit an event for this
- * range. If we return anything the fault caller should return immediately, we
- * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
- * fault again and then the fault handler will run the second time through.
- *
- * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
- */
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- struct file *fpin = NULL;
- int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
- loff_t pos = vmf->pgoff >> PAGE_SHIFT;
- size_t count = PAGE_SIZE;
- int err;
-
- /*
- * We already did this and now we're retrying with everything locked,
- * don't emit the event and continue.
- */
- if (vmf->flags & FAULT_FLAG_TRIED)
- return 0;
-
- /* No watches, we're done. */
- if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
- return 0;
-
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- return VM_FAULT_SIGBUS;
-
- err = fsnotify_file_area_perm(fpin, mask, &pos, count);
- fput(fpin);
- if (err)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_RETRY;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
-/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
@@ -3482,37 +3438,6 @@ retry_find:
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
- * If this is a precontent file we have can now emit an event to
- * try and populate the folio.
- */
- if (!(vmf->flags & FAULT_FLAG_TRIED) &&
- unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
- loff_t pos = folio_pos(folio);
- size_t count = folio_size(folio);
-
- /* We're NOWAIT, we have to retry. */
- if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
- folio_unlock(folio);
- goto out_retry;
- }
-
- if (mapping_locked)
- filemap_invalidate_unlock_shared(mapping);
- mapping_locked = false;
-
- folio_unlock(folio);
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- goto out_retry;
-
- error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
- count);
- if (error)
- ret = VM_FAULT_SIGBUS;
- goto out_retry;
- }
-
- /*
* If the invalidate lock is not held, the folio was in cache
* and uptodate and now it is not. Strange but possible since we
* didn't hold the page lock all the time. Let's drop
@@ -4169,17 +4094,6 @@ retry:
bytes = min(chunk - offset, bytes);
balance_dirty_pages_ratelimited(mapping);
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
- status = -EFAULT;
- break;
- }
-
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
@@ -4197,6 +4111,12 @@ retry:
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
+ /*
+ * Faults here on mmap()s can recurse into arbitrary
+ * filesystem code. Lots of locks are held that can
+ * deadlock. Use an atomic copy to avoid deadlocking
+ * in page fault handling.
+ */
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
@@ -4222,6 +4142,16 @@ retry:
bytes = copied;
goto retry;
}
+
+ /*
+ * 'folio' is now unlocked and faults on it can be
+ * handled. Ensure forward progress by trying to
+ * fault it in now.
+ */
+ if (fault_in_iov_iter_readable(i, bytes) == bytes) {
+ status = -EFAULT;
+ break;
+ }
} else {
pos += status;
written += status;
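Note (not part of the patch): taken together, the write-path hunks move the fault_in_iov_iter_readable() pre-fault from before the copy to after a copy that made no progress. The copy itself already uses copy_folio_from_iter_atomic(), which cannot fault while the destination folio is locked, so faulting the source in is only needed to guarantee forward progress once the folio has been unlocked. A schematic sketch of that ordering; the callbacks are stand-ins, not kernel APIs:

#include <errno.h>
#include <stddef.h>

/* Schematic per-chunk write loop after the reordering.  The callbacks
 * stand in for copy_folio_from_iter_atomic() (copies without faulting,
 * returns bytes copied) and fault_in_iov_iter_readable() (returns how
 * many bytes could NOT be faulted in). */
static long write_chunk(size_t bytes,
			size_t (*atomic_copy)(size_t),
			size_t (*fault_in)(size_t))
{
	for (;;) {
		size_t copied = atomic_copy(bytes);

		if (copied)
			return (long)copied;	/* made progress */

		/* No progress: the source was not resident.  At this
		 * point in the real code the destination folio has been
		 * unlocked, so faulting the source in is safe and
		 * guarantees progress on the next pass. */
		if (fault_in(bytes) == bytes)
			return -EFAULT;		/* source is unreadable */
	}
}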
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3d3ebdc002d5..373781b21e5c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3304,7 +3304,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
folio_account_cleaned(tail,
inode_to_wb(folio->mapping->host));
__filemap_remove_folio(tail, NULL);
- folio_put(tail);
+ folio_put_refs(tail, folio_nr_pages(tail));
} else if (!folio_test_anon(folio)) {
__xa_store(&folio->mapping->i_pages, tail->index,
tail, 0);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97930d44d460..318624c96584 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2135,6 +2135,8 @@ retry:
if (!folio_ref_count(folio)) {
struct hstate *h = folio_hstate(folio);
+ bool adjust_surplus = false;
+
if (!available_huge_pages(h))
goto out;
@@ -2157,7 +2159,9 @@ retry:
goto retry;
}
- remove_hugetlb_folio(h, folio, false);
+ if (h->surplus_huge_pages_node[folio_nid(folio)])
+ adjust_surplus = true;
+ remove_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
@@ -2177,7 +2181,7 @@ retry:
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
+ add_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages++;
goto out;
}
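Note (not part of the patch): the hugetlb hunks make the error path symmetric with the removal. Whether the node's surplus count should be adjusted is decided once, before remove_hugetlb_folio(), and the same adjust_surplus value is replayed if hugetlb_vmemmap_restore_folio() fails and the folio has to be re-added. A compressed toy model of that pattern (the pool struct and names are stand-ins, not the hugetlb API):

#include <stdbool.h>

/* Toy pool; stands in for the hugetlb per-node counters. */
struct pool { long pages; long surplus; };

/* Decide the surplus adjustment once, then replay exactly the same
 * decision on the failure path so the counters stay balanced. */
static int take_page(struct pool *p, int (*prepare)(void))
{
	bool adjust_surplus = p->surplus > 0;

	p->pages--;
	if (adjust_surplus)
		p->surplus--;

	if (prepare()) {		/* e.g. vmemmap restore failed */
		p->pages++;
		if (adjust_surplus)	/* undo symmetrically */
			p->surplus++;
		return -1;
	}
	return 0;
}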
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4de6acb9b8ec..a037ec92881d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1921,9 +1921,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old;
+ unsigned long flags;
stock = &per_cpu(memcg_stock, cpu);
+
+ /* drain_obj_stock requires stock_lock */
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ old = drain_obj_stock(stock);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
drain_stock(stock);
+ obj_cgroup_put(old);
return 0;
}
@@ -4993,7 +5002,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, entry);
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
@@ -5055,7 +5064,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
- swap_cgroup_record(folio, entry);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
return 0;
}
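Note (not part of the patch): two independent fixes share this file. The CPU-hotplug handler now drains the per-CPU objcg stock under stock_lock, as drain_obj_stock() requires, and drops the extracted obj_cgroup reference only after the lock is released; the swap_cgroup_record() callers now pass the memcg id explicitly (see the swap_cgroup.c hunk further down). A userspace toy of the first ordering, with a plain mutex and a heap object standing in for the stock lock and the cached obj_cgroup:

#include <pthread.h>
#include <stdlib.h>

/* Toy per-CPU stock: a lock plus a stashed object. */
struct stock {
	pthread_mutex_t lock;
	int *cached;		/* stands in for the cached obj_cgroup */
};

/* Detach the cached object while holding the stock lock, but release it
 * only after the lock is dropped, mirroring how the hunk orders
 * drain_obj_stock() and obj_cgroup_put(). */
static void drain_on_cpu_dead(struct stock *s)
{
	int *old;

	pthread_mutex_lock(&s->lock);
	old = s->cached;	/* what drain_obj_stock() does, simplified */
	s->cached = NULL;
	pthread_mutex_unlock(&s->lock);

	free(old);		/* "put" the reference outside the lock */
}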
diff --git a/mm/memory.c b/mm/memory.c
index b9661ccfa64f..fb7b8dc75167 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -76,7 +76,6 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
-#include <linux/fsnotify.h>
#include <trace/events/kmem.h>
@@ -5750,17 +5749,8 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
-
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
- /*
- * Currently we just emit PAGE_SIZE for our fault events, so don't allow
- * a huge fault if we have a pre content watch on this file. This would
- * be trivial to support, but there would need to be tests to ensure
- * this works properly and those don't exist currently.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
@@ -5784,9 +5774,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -5809,9 +5796,6 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -5829,9 +5813,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vma))
goto split;
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
diff --git a/mm/migrate.c b/mm/migrate.c
index fb19a18892c8..97f0edf0c032 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -518,15 +518,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */
- if (folio_test_swapbacked(folio)) {
+ if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- if (folio_test_swapcache(folio)) {
- folio_set_swapcache(newfolio);
- newfolio->private = folio_get_private(folio);
- }
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
entries = nr;
} else {
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
entries = 1;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index baa79abdaf03..9cb6e99215e2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1613,13 +1613,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- BUG();
- return 0;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94917c729120..542d25f77be8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7004,7 +7004,7 @@ static inline bool has_unaccepted_memory(void)
static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
- long to_accept;
+ long to_accept, wmark;
bool ret = false;
if (!has_unaccepted_memory())
@@ -7013,8 +7013,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order)
if (list_empty(&zone->unaccepted_pages))
return false;
+ wmark = promo_wmark_pages(zone);
+
+ /*
+ * Watermarks have not been initialized yet.
+ *
+ * Accept one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
+
/* How much to accept to get to promo watermark? */
- to_accept = promo_wmark_pages(zone) -
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
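Note (not part of the patch): the page_alloc hunk guards against promo_wmark_pages() still being zero. Before the zone's watermarks are initialized, the old arithmetic could not produce a meaningful deficit, so the patch accepts a single MAX_ORDER chunk to keep making progress. A small model of the fixed decision, with the zone counters passed in as plain numbers:

#include <stdbool.h>

/* Model of the fixed cond_accept_memory() decision.  Returns true when
 * more memory should be accepted; *accept_one is set when watermarks
 * are not initialized yet and a single MAX_ORDER page should be
 * accepted just to guarantee progress. */
static bool should_accept(long wmark, long free_pages, long unusable,
			  long unaccepted, bool *accept_one)
{
	*accept_one = false;

	if (!wmark) {			/* watermarks not set up yet */
		*accept_one = true;
		return true;
	}

	/* How much to accept to get to the promo watermark? */
	long to_accept = wmark - (free_pages - unusable - unaccepted);

	return to_accept > 0;
}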
diff --git a/mm/readahead.c b/mm/readahead.c
index 220155a5c964..6a4e96b69702 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,7 +128,6 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
#include "internal.h"
@@ -559,15 +558,6 @@ void page_cache_sync_ra(struct readahead_control *ractl,
pgoff_t prev_index, miss;
/*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't find 0 filled pages in cache that we never emitted
- * events for. Filesystems supporting HSM must make sure to not call
- * this function with ractl->file unset for files handled by HSM.
- */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
- /*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
* readahead will do the right thing and limit the read to just the
@@ -645,10 +635,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (!ra->ra_pages)
return;
- /* See the comment in page_cache_sync_ra. */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index be39078f255b..1007c30f12e2 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -58,9 +58,11 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
* entries must not have been charged
*
* @folio: the folio that the swap entry belongs to
+ * @id: mem_cgroup ID to be recorded
* @ent: the first swap entry to be recorded
*/
-void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
+void swap_cgroup_record(struct folio *folio, unsigned short id,
+ swp_entry_t ent)
{
unsigned int nr_ents = folio_nr_pages(folio);
struct swap_cgroup *map;
@@ -72,8 +74,7 @@ void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
map = swap_cgroup_ctrl[swp_type(ent)].map;
do {
- old = __swap_cgroup_id_xchg(map, offset,
- mem_cgroup_id(folio_memcg(folio)));
+ old = __swap_cgroup_id_xchg(map, offset, id);
VM_BUG_ON(old);
} while (++offset != end);
}
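Note (not part of the patch): with the extra id parameter, swap_cgroup_record() no longer derives the owner from the folio itself; the memcontrol.c callers above resolve it once with mem_cgroup_id() and pass it down. A userspace model of the per-page recording loop; the flat unsigned short array stands in for the real packed, atomically updated swap_cgroup map:

#include <assert.h>

/* Simplified model of swap_cgroup_record(folio, id, ent): one map slot
 * per page of the folio is set to the caller-supplied cgroup id, and
 * each slot must still be unset (the folio was not yet charged). */
static void record_swap_owner(unsigned short *map, unsigned long offset,
			      unsigned int nr_pages, unsigned short id)
{
	unsigned long end = offset + nr_pages;

	do {
		unsigned short old = map[offset];

		map[offset] = id;
		assert(old == 0);	/* mirrors VM_BUG_ON(old) */
	} while (++offset != end);
}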
diff --git a/mm/util.c b/mm/util.c
index b6b9684a1438..8c965474d329 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -23,6 +23,7 @@
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/uaccess.h>
@@ -569,6 +570,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
+ if (!ret)
+ ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
diff --git a/mm/vma.c b/mm/vma.c
index 96bcb372c90e..71ca012c616c 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2381,7 +2381,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
* call covers the non-merge case.
*/
- khugepaged_enter_vma(vma, map->flags);
+ if (!vma_is_anonymous(vma))
+ khugepaged_enter_vma(vma, map->flags);
ksm_add_vma(vma);
*vmap = vma;
return 0;