Diffstat (limited to 'mm')
-rw-r--r--  mm/damon/core.c     7
-rw-r--r--  mm/damon/paddr.c    3
-rw-r--r--  mm/filemap.c      126
-rw-r--r--  mm/huge_memory.c    2
-rw-r--r--  mm/hugetlb.c        8
-rw-r--r--  mm/memcontrol.c    13
-rw-r--r--  mm/memory.c        19
-rw-r--r--  mm/migrate.c       10
-rw-r--r--  mm/nommu.c          7
-rw-r--r--  mm/page_alloc.c    14
-rw-r--r--  mm/readahead.c     14
-rw-r--r--  mm/swap_cgroup.c    7
-rw-r--r--  mm/util.c           3
-rw-r--r--  mm/vma.c            3
14 files changed, 80 insertions, 156 deletions
diff --git a/mm/damon/core.c b/mm/damon/core.c
index c7b981308862..384935ef4e65 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -373,6 +373,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
 	 * or damon_attrs are updated.
 	 */
 	scheme->next_apply_sis = 0;
+	scheme->walk_completed = false;
 	INIT_LIST_HEAD(&scheme->filters);
 	scheme->stat = (struct damos_stat){};
 	INIT_LIST_HEAD(&scheme->list);
@@ -1429,9 +1430,13 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
 {
 	struct damos_filter *filter;
 
+	s->core_filters_allowed = false;
 	damos_for_each_filter(filter, s) {
-		if (damos_filter_match(ctx, t, r, filter))
+		if (damos_filter_match(ctx, t, r, filter)) {
+			if (filter->allow)
+				s->core_filters_allowed = true;
 			return !filter->allow;
+		}
 	}
 	return false;
 }
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 0f9ae14f884d..c834aa217835 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -236,6 +236,9 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
 {
 	struct damos_filter *filter;
 
+	if (scheme->core_filters_allowed)
+		return false;
+
 	damos_for_each_filter(filter, scheme) {
 		if (damos_pa_filter_match(filter, folio))
 			return !filter->allow;
diff --git a/mm/filemap.c b/mm/filemap.c
index 2974691fdfad..e9404290f2c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,7 +47,6 @@
 #include <linux/splice.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -1986,8 +1985,19 @@ no_page:
 		if (err == -EEXIST)
 			goto repeat;
-		if (err)
+		if (err) {
+			/*
+			 * When NOWAIT I/O fails to allocate folios this could
+			 * be due to a nonblocking memory allocation and not
+			 * because the system actually is out of memory.
+			 * Return -EAGAIN so that the caller retries in a
+			 * blocking fashion instead of propagating -ENOMEM
+			 * to the application.
+			 */
+			if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
+				err = -EAGAIN;
 			return ERR_PTR(err);
+		}
 		/*
 		 * filemap_add_folio locks the page, and for mmap
 		 * we expect an unlocked page.
@@ -3198,14 +3208,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	unsigned long vm_flags = vmf->vma->vm_flags;
 	unsigned int mmap_miss;
 
-	/*
-	 * If we have pre-content watches we need to disable readahead to make
-	 * sure that we don't populate our mapping with 0 filled pages that we
-	 * never emitted an event for.
-	 */
-	if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
-		return fpin;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	/* Use the readahead code, even if readahead is disabled */
 	if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@@ -3274,10 +3276,6 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
 	struct file *fpin = NULL;
 	unsigned int mmap_miss;
 
-	/* See comment in do_sync_mmap_readahead. */
-	if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
-		return fpin;
-
 	/* If we don't want any read-ahead, don't bother */
 	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
 		return fpin;
@@ -3337,48 +3335,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
 }
 
 /**
- * filemap_fsnotify_fault - maybe emit a pre-content event.
- * @vmf: struct vm_fault containing details of the fault.
- *
- * If we have a pre-content watch on this file we will emit an event for this
- * range. If we return anything, the fault caller should return immediately; we
- * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
- * fault again and then the fault handler will run the second time through.
- *
- * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
- */
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
-	struct file *fpin = NULL;
-	int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
-	loff_t pos = vmf->pgoff >> PAGE_SHIFT;
-	size_t count = PAGE_SIZE;
-	int err;
-
-	/*
-	 * We already did this and now we're retrying with everything locked,
-	 * don't emit the event and continue.
-	 */
-	if (vmf->flags & FAULT_FLAG_TRIED)
-		return 0;
-
-	/* No watches, we're done. */
-	if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
-		return 0;
-
-	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-	if (!fpin)
-		return VM_FAULT_SIGBUS;
-
-	err = fsnotify_file_area_perm(fpin, mask, &pos, count);
-	fput(fpin);
-	if (err)
-		return VM_FAULT_SIGBUS;
-	return VM_FAULT_RETRY;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
-/**
  * filemap_fault - read in file data for page fault handling
  * @vmf: struct vm_fault containing details of the fault
  *
@@ -3482,37 +3438,6 @@ retry_find:
 	 */
 	if (unlikely(!folio_test_uptodate(folio))) {
 		/*
-		 * If this is a precontent file we can now emit an event to
-		 * try and populate the folio.
-		 */
-		if (!(vmf->flags & FAULT_FLAG_TRIED) &&
-		    unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
-			loff_t pos = folio_pos(folio);
-			size_t count = folio_size(folio);
-
-			/* We're NOWAIT, we have to retry. */
-			if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
-				folio_unlock(folio);
-				goto out_retry;
-			}
-
-			if (mapping_locked)
-				filemap_invalidate_unlock_shared(mapping);
-			mapping_locked = false;
-
-			folio_unlock(folio);
-			fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-			if (!fpin)
-				goto out_retry;
-
-			error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
-							count);
-			if (error)
-				ret = VM_FAULT_SIGBUS;
-			goto out_retry;
-		}
-
-		/*
 		 * If the invalidate lock is not held, the folio was in cache
 		 * and uptodate and now it is not. Strange but possible since we
 		 * didn't hold the page lock all the time. Let's drop
@@ -4169,17 +4094,6 @@ retry:
 		bytes = min(chunk - offset, bytes);
 		balance_dirty_pages_ratelimited(mapping);
 
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 */
-		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
-			status = -EFAULT;
-			break;
-		}
-
 		if (fatal_signal_pending(current)) {
 			status = -EINTR;
 			break;
@@ -4197,6 +4111,12 @@ retry:
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_folio(folio);
 
+		/*
+		 * Faults here on mmap()s can recurse into arbitrary
+		 * filesystem code. Lots of locks are held that can
+		 * deadlock. Use an atomic copy to avoid deadlocking
+		 * in page fault handling.
+		 */
 		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
 		flush_dcache_folio(folio);
 
@@ -4222,6 +4142,16 @@ retry:
 			bytes = copied;
 			goto retry;
 		}
+
+		/*
+		 * 'folio' is now unlocked and faults on it can be
+		 * handled. Ensure forward progress by trying to
+		 * fault it in now.
+		 */
+		if (fault_in_iov_iter_readable(i, bytes) == bytes) {
+			status = -EFAULT;
+			break;
+		}
 	} else {
 		pos += status;
 		written += status;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3d3ebdc002d5..373781b21e5c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3304,7 +3304,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 				folio_account_cleaned(tail,
 					inode_to_wb(folio->mapping->host));
 			__filemap_remove_folio(tail, NULL);
-			folio_put(tail);
+			folio_put_refs(tail, folio_nr_pages(tail));
 		} else if (!folio_test_anon(folio)) {
 			__xa_store(&folio->mapping->i_pages, tail->index,
 					tail, 0);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97930d44d460..318624c96584 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2135,6 +2135,8 @@ retry:
 
 	if (!folio_ref_count(folio)) {
 		struct hstate *h = folio_hstate(folio);
+		bool adjust_surplus = false;
+
 		if (!available_huge_pages(h))
 			goto out;
 
@@ -2157,7 +2159,9 @@ retry:
 			goto retry;
 		}
 
-		remove_hugetlb_folio(h, folio, false);
+		if (h->surplus_huge_pages_node[folio_nid(folio)])
+			adjust_surplus = true;
+		remove_hugetlb_folio(h, folio, adjust_surplus);
 		h->max_huge_pages--;
 		spin_unlock_irq(&hugetlb_lock);
 
@@ -2177,7 +2181,7 @@ retry:
 		rc = hugetlb_vmemmap_restore_folio(h, folio);
 		if (rc) {
 			spin_lock_irq(&hugetlb_lock);
-			add_hugetlb_folio(h, folio, false);
+			add_hugetlb_folio(h, folio, adjust_surplus);
 			h->max_huge_pages++;
 			goto out;
 		}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4de6acb9b8ec..a037ec92881d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1921,9 +1921,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
 	struct memcg_stock_pcp *stock;
+	struct obj_cgroup *old;
+	unsigned long flags;
 
 	stock = &per_cpu(memcg_stock, cpu);
+
+	/* drain_obj_stock requires stock_lock */
+	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	old = drain_obj_stock(stock);
+	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
 	drain_stock(stock);
+	obj_cgroup_put(old);
 
 	return 0;
 }
@@ -4993,7 +5002,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
 		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
 	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
 
-	swap_cgroup_record(folio, entry);
+	swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
 
 	folio_unqueue_deferred_split(folio);
 	folio->memcg_data = 0;
@@ -5055,7 +5064,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
 		mem_cgroup_id_get_many(memcg, nr_pages - 1);
 	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
 
-	swap_cgroup_record(folio, entry);
+	swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
 
 	return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index b9661ccfa64f..fb7b8dc75167 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -76,7 +76,6 @@
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/sysctl.h>
-#include <linux/fsnotify.h>
 
 #include <trace/events/kmem.h>
 
@@ -5750,17 +5749,8 @@ out_map:
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-
 	if (vma_is_anonymous(vma))
 		return do_huge_pmd_anonymous_page(vmf);
-	/*
-	 * Currently we just emit PAGE_SIZE for our fault events, so don't allow
-	 * a huge fault if we have a pre-content watch on this file. This would
-	 * be trivial to support, but there would need to be tests to ensure
-	 * this works properly and those don't exist currently.
-	 */
-	if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
-		return VM_FAULT_FALLBACK;
 	if (vma->vm_ops->huge_fault)
 		return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
 	return VM_FAULT_FALLBACK;
@@ -5784,9 +5774,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 	}
 
 	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
-		/* See comment in create_huge_pmd. */
-		if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
-			goto split;
 		if (vma->vm_ops->huge_fault) {
 			ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
 			if (!(ret & VM_FAULT_FALLBACK))
@@ -5809,9 +5796,6 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
 	/* No support for anonymous transparent PUD pages yet */
 	if (vma_is_anonymous(vma))
 		return VM_FAULT_FALLBACK;
-	/* See comment in create_huge_pmd. */
-	if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
-		return VM_FAULT_FALLBACK;
 	if (vma->vm_ops->huge_fault)
 		return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -5829,9 +5813,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 	if (vma_is_anonymous(vma))
 		goto split;
 	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
-		/* See comment in create_huge_pmd. */
-		if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
-			goto split;
 		if (vma->vm_ops->huge_fault) {
 			ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
 			if (!(ret & VM_FAULT_FALLBACK))
diff --git a/mm/migrate.c b/mm/migrate.c
index fb19a18892c8..97f0edf0c032 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -518,15 +518,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
 	if (folio_test_anon(folio) && folio_test_large(folio))
 		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
 	folio_ref_add(newfolio, nr); /* add cache reference */
-	if (folio_test_swapbacked(folio)) {
+	if (folio_test_swapbacked(folio))
 		__folio_set_swapbacked(newfolio);
-		if (folio_test_swapcache(folio)) {
-			folio_set_swapcache(newfolio);
-			newfolio->private = folio_get_private(folio);
-		}
+	if (folio_test_swapcache(folio)) {
+		folio_set_swapcache(newfolio);
+		newfolio->private = folio_get_private(folio);
 		entries = nr;
 	} else {
-		VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
 		entries = 1;
 	}
 
diff --git a/mm/nommu.c b/mm/nommu.c
index baa79abdaf03..9cb6e99215e2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1613,13 +1613,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 }
 EXPORT_SYMBOL(remap_vmalloc_range);
 
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
-	BUG();
-	return 0;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
 vm_fault_t filemap_fault(struct vm_fault *vmf)
 {
 	BUG();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94917c729120..542d25f77be8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7004,7 +7004,7 @@ static inline bool has_unaccepted_memory(void)
 
 static bool cond_accept_memory(struct zone *zone, unsigned int order)
 {
-	long to_accept;
+	long to_accept, wmark;
 	bool ret = false;
 
 	if (!has_unaccepted_memory())
@@ -7013,8 +7013,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order)
 	if (list_empty(&zone->unaccepted_pages))
 		return false;
 
+	wmark = promo_wmark_pages(zone);
+
+	/*
+	 * Watermarks have not been initialized yet.
+	 *
+	 * Accept one MAX_ORDER page to ensure progress.
+	 */
+	if (!wmark)
+		return try_to_accept_memory_one(zone);
+
 	/* How much to accept to get to promo watermark? */
-	to_accept = promo_wmark_pages(zone) -
+	to_accept = wmark -
 		    (zone_page_state(zone, NR_FREE_PAGES) -
 		    __zone_watermark_unusable_free(zone, order, 0) -
 		    zone_page_state(zone, NR_UNACCEPTED));
diff --git a/mm/readahead.c b/mm/readahead.c
index 220155a5c964..6a4e96b69702 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,7 +128,6 @@
 #include <linux/blk-cgroup.h>
 #include <linux/fadvise.h>
 #include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
 
 #include "internal.h"
 
@@ -559,15 +558,6 @@ void page_cache_sync_ra(struct readahead_control *ractl,
 	pgoff_t prev_index, miss;
 
 	/*
-	 * If we have pre-content watches we need to disable readahead to make
-	 * sure that we don't find 0 filled pages in cache that we never emitted
-	 * events for. Filesystems supporting HSM must make sure to not call
-	 * this function with ractl->file unset for files handled by HSM.
-	 */
-	if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
-		return;
-
-	/*
 	 * Even if readahead is disabled, issue this request as readahead
 	 * as we'll need it to satisfy the requested range. The forced
 	 * readahead will do the right thing and limit the read to just the
@@ -645,10 +635,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
 	if (!ra->ra_pages)
 		return;
 
-	/* See the comment in page_cache_sync_ra. */
-	if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
-		return;
-
 	/*
 	 * Same bit is used for PG_readahead and PG_reclaim.
 	 */
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index be39078f255b..1007c30f12e2 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -58,9 +58,11 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
 * entries must not have been charged
 *
 * @folio: the folio that the swap entry belongs to
+ * @id: mem_cgroup ID to be recorded
 * @ent: the first swap entry to be recorded
 */
-void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
+void swap_cgroup_record(struct folio *folio, unsigned short id,
+			swp_entry_t ent)
 {
 	unsigned int nr_ents = folio_nr_pages(folio);
 	struct swap_cgroup *map;
@@ -72,8 +74,7 @@ void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
 	map = swap_cgroup_ctrl[swp_type(ent)].map;
 
 	do {
-		old = __swap_cgroup_id_xchg(map, offset,
-					    mem_cgroup_id(folio_memcg(folio)));
+		old = __swap_cgroup_id_xchg(map, offset, id);
 		VM_BUG_ON(old);
 	} while (++offset != end);
 }
diff --git a/mm/util.c b/mm/util.c
index b6b9684a1438..8c965474d329 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -23,6 +23,7 @@
 #include <linux/processor.h>
 #include <linux/sizes.h>
 #include <linux/compat.h>
+#include <linux/fsnotify.h>
 
 #include <linux/uaccess.h>
 
@@ -569,6 +570,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	LIST_HEAD(uf);
 
 	ret = security_mmap_file(file, prot, flag);
+	if (!ret)
+		ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
 	if (!ret) {
 		if (mmap_write_lock_killable(mm))
 			return -EINTR;
diff --git a/mm/vma.c b/mm/vma.c
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2381,7 +2381,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 	 * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
 	 * call covers the non-merge case.
 	 */
-	khugepaged_enter_vma(vma, map->flags);
+	if (!vma_is_anonymous(vma))
+		khugepaged_enter_vma(vma, map->flags);
 	ksm_add_vma(vma);
 	*vmap = vma;
 	return 0;
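
A note on the DAMON hunks above: damos_filter_out() now remembers whether a core-layer "allow" filter matched (core_filters_allowed), and damos_pa_filter_out() honors that before running its own physical-address filters. Below is a simplified userspace model of that two-stage decision, not kernel code; the struct filter type and its matches flag are invented stand-ins for damos_filter and damos_filter_match().

/*
 * Model of two-stage DAMOS filtering: if a core-layer allow
 * filter already matched, the paddr layer must not filter the
 * folio out again.
 */
#include <stdbool.h>
#include <stdio.h>

struct filter {
	bool allow;
	bool matches;	/* stand-in for damos_filter_match() */
};

static bool core_filters_allowed;

/* core layer: first matching filter decides; remember allows */
static bool core_filter_out(struct filter *f, int n)
{
	core_filters_allowed = false;
	for (int i = 0; i < n; i++) {
		if (f[i].matches) {
			if (f[i].allow)
				core_filters_allowed = true;
			return !f[i].allow;
		}
	}
	return false;
}

/* paddr layer: honor a core-layer allow unconditionally */
static bool pa_filter_out(struct filter *f, int n)
{
	if (core_filters_allowed)
		return false;
	for (int i = 0; i < n; i++)
		if (f[i].matches)
			return !f[i].allow;
	return false;
}

int main(void)
{
	struct filter core[] = { { .allow = true, .matches = true } };
	struct filter pa[] = { { .allow = false, .matches = true } };

	printf("core filtered out: %d\n", core_filter_out(core, 1));
	/* prints 0: the core-layer allow overrides the paddr reject */
	printf("paddr filtered out: %d\n", pa_filter_out(pa, 1));
	return 0;
}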
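The __filemap_get_folio() hunk maps -ENOMEM to -EAGAIN under FGP_NOWAIT, so a nonblocking allocation failure surfaces to userspace as "try again" rather than out-of-memory. That matches the usual RWF_NOWAIT contract; here is a minimal sketch of a caller honoring it, assuming a glibc with preadv2() and RWF_NOWAIT, and a hypothetical /tmp/example-file path.

/*
 * A RWF_NOWAIT read may fail with EAGAIN (not ENOMEM) when it
 * cannot make progress without blocking; retry without the flag.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("/tmp/example-file", O_RDONLY);	/* placeholder path */
	ssize_t n;

	if (fd < 0)
		return 1;

	n = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (n < 0 && errno == EAGAIN) {
		/* data (or memory) wasn't ready; fall back to blocking */
		n = preadv2(fd, &iov, 1, 0, 0);
	}
	printf("read %zd bytes\n", n);
	close(fd);
	return 0;
}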
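The generic_perform_write() hunks reorder the loop: the old code faulted the source pages in before taking locks, while the new code copies optimistically with copy_folio_from_iter_atomic() and only calls fault_in_iov_iter_readable() after the folio is unlocked, preserving forward progress without the deadlock risk. A userspace model of that retry shape follows; try_copy_nofault() and fault_in() are invented stand-ins for the kernel primitives.

/*
 * Model of the reordered write loop: attempt a no-fault copy
 * first; on zero progress, fault the source in and retry, and
 * fail only if even the fault-in cannot make progress.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* pretend-copy that refuses to fault the source in */
static size_t try_copy_nofault(char *dst, const char *src, size_t len,
			       bool src_resident)
{
	if (!src_resident)
		return 0;	/* would have faulted; copy nothing */
	memcpy(dst, src, len);
	return len;
}

static int fault_in(bool *src_resident)
{
	*src_resident = true;	/* simulate a successful page-in */
	return 0;
}

int main(void)
{
	char dst[8], src[8] = "payload";
	bool resident = false;	/* source initially not paged in */
	size_t copied;

	for (;;) {
		copied = try_copy_nofault(dst, src, sizeof(src), resident);
		if (copied)
			break;		/* forward progress was made */
		if (fault_in(&resident))
			return 1;	/* the -EFAULT case */
		/* retry after fault-in, mirroring the patched loop */
	}
	printf("copied %zu bytes\n", copied);
	return 0;
}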
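swap_cgroup_record() now takes the mem_cgroup ID explicitly from its callers (mem_cgroup_swapout() and __mem_cgroup_try_charge_swap() pass mem_cgroup_id()) instead of rederiving it from the folio inside the loop. The map it writes is, conceptually, one unsigned short memcg ID per swap slot. A toy model of the record loop, mirroring the VM_BUG_ON(old) assert-on-overwrite; the array size, IDs, and offsets are made up.

/*
 * Toy model of the swap_cgroup map: swap offset -> memcg ID,
 * recorded for every page of a large folio; each slot must have
 * been uncharged (ID 0) before recording.
 */
#include <assert.h>
#include <stdio.h>

#define NR_SLOTS 64

static unsigned short map[NR_SLOTS];

static void record(unsigned short id, unsigned int offset,
		   unsigned int nr_ents)
{
	unsigned int end = offset + nr_ents;

	do {
		unsigned short old = map[offset];

		map[offset] = id;	/* models __swap_cgroup_id_xchg() */
		assert(old == 0);	/* models VM_BUG_ON(old) */
	} while (++offset != end);
}

int main(void)
{
	/* record a 4-page folio owned by (hypothetical) memcg ID 7 */
	record(7, 12, 4);
	printf("slot 13 -> memcg %u\n", (unsigned)map[13]);
	return 0;
}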
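The cond_accept_memory() hunk reads promo_wmark_pages() once, falls back to accepting a single MAX_ORDER page while watermarks are still zero (early boot, before initialization), and otherwise computes how many pages must be accepted to reach the promo watermark. The arithmetic, sketched with hypothetical page counts:

/*
 * to_accept = watermark - (free - unusable - unaccepted):
 * how far the usable free count falls short of the target.
 * All numbers below are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	long wmark = 2048;	/* promo watermark, pages */
	long free = 1500;	/* NR_FREE_PAGES */
	long unusable = 100;	/* watermark-unusable free pages */
	long unaccepted = 300;	/* NR_UNACCEPTED */
	long to_accept;

	if (!wmark) {
		puts("watermarks uninitialized: accept one MAX_ORDER page");
		return 0;
	}

	to_accept = wmark - (free - unusable - unaccepted);
	printf("need to accept %ld pages\n", to_accept);	/* 948 */
	return 0;
}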
