diff options
Diffstat (limited to 'mm')
56 files changed, 1544 insertions, 893 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..4108bcd96784 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB config ARCH_WANTS_THP_SWAP def_bool n +config PERSISTENT_HUGE_ZERO_FOLIO + bool "Allocate a PMD sized folio for zeroing" + depends on TRANSPARENT_HUGEPAGE + help + Enable this option to reduce the runtime refcounting overhead + of the huge zero folio and expand the places in the kernel + that can use huge zero folios. For instance, block I/O benefits + from access to large folios for zeroing memory. + + With this option enabled, the huge zero folio is allocated + once and never freed. One full huge page's worth of memory shall + be used. + + Say Y if your system has lots of memory. Say N if you are + memory constrained. + config MM_ID def_bool n @@ -1381,6 +1397,8 @@ config PT_RECLAIM Note: now only empty user PTE page table pages will be reclaimed. +config FIND_NORMAL_PAGE + def_bool n source "mm/damon/Kconfig" diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..e4d578e6121c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work) /* * Initial write bandwidth: 100 MB/s */ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) +#define INIT_BW MB_TO_PAGES(100) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) @@ -864,7 +864,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, if (!count) return page; - trace_cma_alloc_start(name, count, align); + trace_cma_alloc_start(name, count, cma->available_count, cma->count, align); for (r = 0; r < cma->nranges; r++) { page = NULL; diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index b3171f9406c1..8c868f7035fc 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -104,7 +104,7 @@ config DAMON_STAT config DAMON_STAT_ENABLED_DEFAULT bool "Enable DAMON_STAT by default" - depends on DAMON_PADDR + depends on DAMON_STAT default DAMON_STAT help Whether to enable DAMON_STAT by default. Users can disable it in diff --git a/mm/damon/core.c b/mm/damon/core.c index 08065b363972..be5942435d78 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * @t: the given target. * @ranges: array of new monitoring target ranges. * @nr_ranges: length of @ranges. + * @min_sz_region: minimum region size. * * This function adds new regions to, or modify existing regions of a * monitoring target to fit in specific ranges. @@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * Return: 0 if success, or negative error code otherwise. */ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges) + unsigned int nr_ranges, unsigned long min_sz_region) { struct damon_region *r, *next; unsigned int i; @@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, /* no region intersects with this range */ newr = damon_new_region( ALIGN_DOWN(range->start, - DAMON_MIN_REGION), - ALIGN(range->end, DAMON_MIN_REGION)); + min_sz_region), + ALIGN(range->end, min_sz_region)); if (!newr) return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, - DAMON_MIN_REGION); - last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + min_sz_region); + last->ar.end = ALIGN(range->end, min_sz_region); /* fill possible holes in the range */ err = damon_fill_regions_holes(first, last, t); @@ -544,6 +545,9 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; + ctx->addr_unit = 1; + ctx->min_sz_region = DAMON_MIN_REGION; + INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -570,6 +574,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +static bool damon_attrs_equals(const struct damon_attrs *attrs1, + const struct damon_attrs *attrs2) +{ + const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal; + const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal; + + return attrs1->sample_interval == attrs2->sample_interval && + attrs1->aggr_interval == attrs2->aggr_interval && + attrs1->ops_update_interval == attrs2->ops_update_interval && + attrs1->min_nr_regions == attrs2->min_nr_regions && + attrs1->max_nr_regions == attrs2->max_nr_regions && + ig1->access_bp == ig2->access_bp && + ig1->aggrs == ig2->aggrs && + ig1->min_sample_us == ig2->min_sample_us && + ig1->max_sample_us == ig2->max_sample_us; +} + static unsigned int damon_age_for_new_attrs(unsigned int age, struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) { @@ -1108,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx) * * If @src has no region, @dst keeps current regions. */ -static int damon_commit_target_regions( - struct damon_target *dst, struct damon_target *src) +static int damon_commit_target_regions(struct damon_target *dst, + struct damon_target *src, unsigned long src_min_sz_region) { struct damon_region *src_region; struct damon_addr_range *ranges; @@ -1126,18 +1147,19 @@ static int damon_commit_target_regions( i = 0; damon_for_each_region(src_region, src) ranges[i++] = src_region->ar; - err = damon_set_regions(dst, ranges, i); + err = damon_set_regions(dst, ranges, i, src_min_sz_region); kfree(ranges); return err; } static int damon_commit_target( struct damon_target *dst, bool dst_has_pid, - struct damon_target *src, bool src_has_pid) + struct damon_target *src, bool src_has_pid, + unsigned long src_min_sz_region) { int err; - err = damon_commit_target_regions(dst, src); + err = damon_commit_target_regions(dst, src, src_min_sz_region); if (err) return err; if (dst_has_pid) @@ -1159,7 +1181,8 @@ static int damon_commit_targets( if (src_target) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) return err; } else { @@ -1182,7 +1205,8 @@ static int damon_commit_targets( if (!new_target) return -ENOMEM; err = damon_commit_target(new_target, false, - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) { damon_destroy_target(new_target, NULL); return err; @@ -1222,10 +1246,14 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) * 2. ops update should be done after pid handling is done (target * committing require putting pids). */ - err = damon_set_attrs(dst, &src->attrs); - if (err) - return err; + if (!damon_attrs_equals(&dst->attrs, &src->attrs)) { + err = damon_set_attrs(dst, &src->attrs); + if (err) + return err; + } dst->ops = src->ops; + dst->addr_unit = src->addr_unit; + dst->min_sz_region = src->min_sz_region; return 0; } @@ -1258,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION) - sz = DAMON_MIN_REGION; + if (sz < ctx->min_sz_region) + sz = ctx->min_sz_region; return sz; } @@ -1603,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * @t: The target of the region. * @rp: The pointer to the region. * @s: The scheme to be applied. + * @min_sz_region: minimum region size. * * If a quota of a scheme has exceeded in a quota charge window, the scheme's * action would applied to only a part of the target access pattern fulfilling @@ -1620,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * Return: true if the region should be entirely skipped, false otherwise. */ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s) + struct damon_region **rp, struct damos *s, unsigned long min_sz_region) { struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; @@ -1642,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t, if (quota->charge_addr_from && r->ar.start < quota->charge_addr_from) { sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); + r->ar.start, min_sz_region); if (!sz_to_skip) { - if (damon_sz_region(r) <= DAMON_MIN_REGION) + if (damon_sz_region(r) <= min_sz_region) return true; - sz_to_skip = DAMON_MIN_REGION; + sz_to_skip = min_sz_region; } damon_split_region_at(t, r, sz_to_skip); r = damon_next_region(r); @@ -1671,7 +1700,8 @@ static void damos_update_stat(struct damos *s, } static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos_filter *filter) + struct damon_region *r, struct damos_filter *filter, + unsigned long min_sz_region) { bool matched = false; struct damon_target *ti; @@ -1688,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, matched = target_idx == filter->target_idx; break; case DAMOS_FILTER_TYPE_ADDR: - start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); - end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); + start = ALIGN_DOWN(filter->addr_range.start, min_sz_region); + end = ALIGN_DOWN(filter->addr_range.end, min_sz_region); /* inside the range */ if (start <= r->ar.start && r->ar.end <= end) { @@ -1725,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter)) { + if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { if (filter->allow) s->core_filters_allowed = true; return !filter->allow; @@ -1860,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); + c->min_sz_region); if (!sz) goto update_stat; damon_split_region_at(t, r, sz); @@ -1908,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s)) + if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; if (!damos_valid_target(c, t, r, s)) @@ -2306,7 +2336,8 @@ static void damon_split_region_at(struct damon_target *t, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_target *t, int nr_subs) +static void damon_split_regions_of(struct damon_target *t, int nr_subs, + unsigned long min_sz_region) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2316,13 +2347,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && - sz_region > 2 * DAMON_MIN_REGION; i++) { + sz_region > 2 * min_sz_region; i++) { /* * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ sz_sub = ALIGN_DOWN(damon_rand(1, 10) * - sz_region / 10, DAMON_MIN_REGION); + sz_region / 10, min_sz_region); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) continue; @@ -2362,7 +2393,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions); + damon_split_regions_of(t, nr_subregions, ctx->min_sz_region); last_nr_regions = nr_regions; } @@ -2755,7 +2786,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1); + return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); } /* diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 99321ff5cb92..998c5180a603 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -303,7 +303,7 @@ static unsigned int __damon_migrate_folio_list( * instead of migrated. */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | - __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, + __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, }; @@ -412,3 +412,12 @@ unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid) return nr_migrated; } + +bool damos_ops_has_filter(struct damos *s) +{ + struct damos_filter *f; + + damos_for_each_ops_filter(f, s) + return true; + return false; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 61ad54aaf256..5efa5b5970de 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -21,3 +21,5 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio); unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid); + +bool damos_ops_has_filter(struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 53a55c5114fb..07a8aead439e 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -18,7 +18,26 @@ #include "../internal.h" #include "ops-common.h" -static void damon_pa_mkold(unsigned long paddr) +static phys_addr_t damon_pa_phys_addr( + unsigned long addr, unsigned long addr_unit) +{ + return (phys_addr_t)addr * addr_unit; +} + +static unsigned long damon_pa_core_addr( + phys_addr_t pa, unsigned long addr_unit) +{ + /* + * Use div_u64() for avoiding linking errors related with __udivdi3, + * __aeabi_uldivmod, or similar problems. This should also improve the + * performance optimization (read div_u64() comment for the detail). + */ + if (sizeof(pa) == 8 && sizeof(addr_unit) == 4) + return div_u64(pa, addr_unit); + return pa / addr_unit; +} + +static void damon_pa_mkold(phys_addr_t paddr) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); @@ -29,11 +48,12 @@ static void damon_pa_mkold(unsigned long paddr) folio_put(folio); } -static void __damon_pa_prepare_access_check(struct damon_region *r) +static void __damon_pa_prepare_access_check(struct damon_region *r, + unsigned long addr_unit) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); - damon_pa_mkold(r->sampling_addr); + damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit)); } static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) @@ -43,11 +63,11 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(r); + __damon_pa_prepare_access_check(r, ctx->addr_unit); } } -static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) +static bool damon_pa_young(phys_addr_t paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); bool accessed; @@ -62,23 +82,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) } static void __damon_pa_check_access(struct damon_region *r, - struct damon_attrs *attrs) + struct damon_attrs *attrs, unsigned long addr_unit) { - static unsigned long last_addr; + static phys_addr_t last_addr; static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; + phys_addr_t sampling_addr = damon_pa_phys_addr( + r->sampling_addr, addr_unit); /* If the region is in the last checked page, reuse the result */ if (ALIGN_DOWN(last_addr, last_folio_sz) == - ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { + ALIGN_DOWN(sampling_addr, last_folio_sz)) { damon_update_region_access_rate(r, last_accessed, attrs); return; } - last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); + last_accessed = damon_pa_young(sampling_addr, &last_folio_sz); damon_update_region_access_rate(r, last_accessed, attrs); - last_addr = r->sampling_addr; + last_addr = sampling_addr; } static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) @@ -89,7 +111,8 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(r, &ctx->attrs); + __damon_pa_check_access( + r, &ctx->attrs, ctx->addr_unit); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } @@ -125,10 +148,11 @@ static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s) return false; } -static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_pageout(struct damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr, applied; + phys_addr_t addr, applied; LIST_HEAD(folio_list); bool install_young_filter = true; struct damos_filter *filter; @@ -149,8 +173,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, damos_add_filter(s, filter); } - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -160,7 +184,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, if (damos_pa_filter_out(s, folio)) goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; folio_clear_referenced(folio); folio_test_clear_young(folio); @@ -179,18 +203,19 @@ put_folio: applied = reclaim_pages(&folio_list); cond_resched(); s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, struct damos *s, bool mark_accessed, + struct damon_region *r, unsigned long addr_unit, + struct damos *s, bool mark_accessed, unsigned long *sz_filter_passed) { - unsigned long addr, applied = 0; + phys_addr_t addr, applied = 0; struct folio *folio; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -200,7 +225,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (damos_pa_filter_out(s, folio)) goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; if (mark_accessed) folio_mark_accessed(folio); @@ -212,32 +237,35 @@ put_folio: folio_put(folio); } s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } static unsigned long damon_pa_mark_accessed(struct damon_region *r, - struct damos *s, unsigned long *sz_filter_passed) + unsigned long addr_unit, struct damos *s, + unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, true, + return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true, sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, - struct damos *s, unsigned long *sz_filter_passed) + unsigned long addr_unit, struct damos *s, + unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, false, + return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false, sz_filter_passed); } -static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_migrate(struct damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr, applied; + phys_addr_t addr, applied; LIST_HEAD(folio_list); struct folio *folio; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -247,7 +275,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, if (damos_pa_filter_out(s, folio)) goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; if (!folio_isolate_lru(folio)) goto put_folio; @@ -259,29 +287,21 @@ put_folio: applied = damon_migrate_pages(&folio_list, s->target_nid); cond_resched(); s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } -static bool damon_pa_scheme_has_filter(struct damos *s) -{ - struct damos_filter *f; - - damos_for_each_ops_filter(f, s) - return true; - return false; -} - -static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_stat(struct damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr; + phys_addr_t addr; struct folio *folio; - if (!damon_pa_scheme_has_filter(s)) + if (!damos_ops_has_filter(s)) return 0; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -289,7 +309,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, } if (!damos_pa_filter_out(s, folio)) - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; addr += folio_size(folio); folio_put(folio); } @@ -301,18 +321,22 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) { + unsigned long aunit = ctx->addr_unit; + switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r, scheme, sz_filter_passed); + return damon_pa_pageout(r, aunit, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, scheme, sz_filter_passed); + return damon_pa_mark_accessed(r, aunit, scheme, + sz_filter_passed); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r, scheme, sz_filter_passed); + return damon_pa_deactivate_pages(r, aunit, scheme, + sz_filter_passed); case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: - return damon_pa_migrate(r, scheme, sz_filter_passed); + return damon_pa_migrate(r, aunit, scheme, sz_filter_passed); case DAMOS_STAT: - return damon_pa_stat(r, scheme, sz_filter_passed); + return damon_pa_stat(r, aunit, scheme, sz_filter_passed); default: /* DAMOS actions that not yet supported by 'paddr'. */ break; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index c96c2154128f..fe4e73d0ebbb 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -834,6 +834,7 @@ static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = { struct damon_sysfs_context { struct kobject kobj; enum damon_ops_id ops_id; + unsigned long addr_unit; struct damon_sysfs_attrs *attrs; struct damon_sysfs_targets *targets; struct damon_sysfs_schemes *schemes; @@ -849,6 +850,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc( return NULL; context->kobj = (struct kobject){}; context->ops_id = ops_id; + context->addr_unit = 1; return context; } @@ -997,6 +999,32 @@ static ssize_t operations_store(struct kobject *kobj, return -EINVAL; } +static ssize_t addr_unit_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%lu\n", context->addr_unit); +} + +static ssize_t addr_unit_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + unsigned long input_addr_unit; + int err = kstrtoul(buf, 0, &input_addr_unit); + + if (err) + return err; + if (!input_addr_unit) + return -EINVAL; + + context->addr_unit = input_addr_unit; + return count; +} + static void damon_sysfs_context_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_context, kobj)); @@ -1008,9 +1036,13 @@ static struct kobj_attribute damon_sysfs_context_avail_operations_attr = static struct kobj_attribute damon_sysfs_context_operations_attr = __ATTR_RW_MODE(operations, 0600); +static struct kobj_attribute damon_sysfs_context_addr_unit_attr = + __ATTR_RW_MODE(addr_unit, 0600); + static struct attribute *damon_sysfs_context_attrs[] = { &damon_sysfs_context_avail_operations_attr.attr, &damon_sysfs_context_operations_attr.attr, + &damon_sysfs_context_addr_unit_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_context); @@ -1301,7 +1333,8 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, } static int damon_sysfs_set_regions(struct damon_target *t, - struct damon_sysfs_regions *sysfs_regions) + struct damon_sysfs_regions *sysfs_regions, + unsigned long min_sz_region) { struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); @@ -1323,7 +1356,7 @@ static int damon_sysfs_set_regions(struct damon_target *t, if (ranges[i - 1].end > ranges[i].start) goto out; } - err = damon_set_regions(t, ranges, sysfs_regions->nr); + err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region); out: kfree(ranges); return err; @@ -1344,7 +1377,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, /* caller will destroy targets */ return -EINVAL; } - return damon_sysfs_set_regions(t, sys_target->regions); + return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1401,6 +1434,8 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, err = damon_select_ops(ctx, sys_ctx->ops_id); if (err) return err; + ctx->addr_unit = sys_ctx->addr_unit; + ctx->min_sz_region = max(DAMON_MIN_REGION / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index dfedfff19940..51369e35298b 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); - damon_split_regions_of(t, 2); + damon_split_regions_of(t, 2, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); - damon_split_regions_of(t, 4); + damon_split_regions_of(t, 4, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); @@ -303,7 +303,7 @@ static void damon_test_set_regions(struct kunit *test) damon_add_region(r1, t); damon_add_region(r2, t); - damon_set_regions(t, &range, 1); + damon_set_regions(t, &range, 1, DAMON_MIN_REGION); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); damon_for_each_region(r, t) { @@ -419,6 +419,22 @@ static void damos_test_new_filter(struct kunit *test) damos_destroy_filter(filter); } +static void damos_test_commit_filter(struct kunit *test) +{ + struct damos_filter *src_filter = damos_new_filter( + DAMOS_FILTER_TYPE_ANON, true, true); + struct damos_filter *dst_filter = damos_new_filter( + DAMOS_FILTER_TYPE_ACTIVE, false, false); + + damos_commit_filter(dst_filter, src_filter); + KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type); + KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching); + KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow); + + damos_destroy_filter(src_filter); + damos_destroy_filter(dst_filter); +} + static void damos_test_filter_out(struct kunit *test) { struct damon_target *t; @@ -434,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -465,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); @@ -594,6 +615,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_set_attrs), KUNIT_CASE(damon_test_moving_sum), KUNIT_CASE(damos_test_new_filter), + KUNIT_CASE(damos_test_commit_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), KUNIT_CASE(damon_test_set_filters_default_reject), diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index d2b37ccf2cc0..fce38dd53cf8 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 87e825349bdf..8c048f9b129e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); } } @@ -890,6 +890,107 @@ free_lists: return applied * PAGE_SIZE; } +struct damos_va_stat_private { + struct damos *scheme; + unsigned long *sz_filter_passed; +}; + +static inline bool damos_va_invalid_folio(struct folio *folio, + struct damos *s) +{ + return !folio || folio == s->last_applied; +} + +static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct damos_va_stat_private *priv = walk->private; + struct damos *s = priv->scheme; + unsigned long *sz_filter_passed = priv->sz_filter_passed; + struct vm_area_struct *vma = walk->vma; + struct folio *folio; + spinlock_t *ptl; + pte_t *start_pte, *pte, ptent; + int nr; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t pmde; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + pmde = pmdp_get(pmd); + if (!pmd_present(pmde)) + goto huge_unlock; + + folio = vm_normal_folio_pmd(vma, addr, pmde); + + if (damos_va_invalid_folio(folio, s)) + goto huge_unlock; + + if (!damos_va_filter_out(s, folio, vma, addr, NULL, pmd)) + *sz_filter_passed += folio_size(folio); + s->last_applied = folio; + +huge_unlock: + spin_unlock(ptl); + return 0; + } +#endif + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!start_pte) + return 0; + + for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; + ptent = ptep_get(pte); + + if (pte_none(ptent) || !pte_present(ptent)) + continue; + + folio = vm_normal_folio(vma, addr, ptent); + + if (damos_va_invalid_folio(folio, s)) + continue; + + if (!damos_va_filter_out(s, folio, vma, addr, pte, NULL)) + *sz_filter_passed += folio_size(folio); + nr = folio_nr_pages(folio); + s->last_applied = folio; + } + pte_unmap_unlock(start_pte, ptl); + return 0; +} + +static unsigned long damos_va_stat(struct damon_target *target, + struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + struct damos_va_stat_private priv; + struct mm_struct *mm; + struct mm_walk_ops walk_ops = { + .pmd_entry = damos_va_stat_pmd_entry, + .walk_lock = PGWALK_RDLOCK, + }; + + priv.scheme = s; + priv.sz_filter_passed = sz_filter_passed; + + if (!damos_ops_has_filter(s)) + return 0; + + mm = damon_get_mm(target); + if (!mm) + return 0; + + mmap_read_lock(mm); + walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); + mmap_read_unlock(mm); + mmput(mm); + return 0; +} + static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) @@ -916,7 +1017,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_MIGRATE_COLD: return damos_va_migrate(t, r, scheme, sz_filter_passed); case DAMOS_STAT: - return 0; + return damos_va_stat(t, r, scheme, sz_filter_passed); default: /* * DAMOS actions that are not yet supported by 'vaddr'. diff --git a/mm/debug.c b/mm/debug.c index b4388f4dcd4d..64ddb0c4b4be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm) "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %px flags %lx\n" + "binfmt %px flags %*pb\n" #ifdef CONFIG_AIO "ioctx_table %px\n" #endif @@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm) mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, - mm->binfmt, mm->flags, + mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm), #ifdef CONFIG_AIO mm->ioctx_table, #endif diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..cd9387b0a5b5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } + if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, -nr); /* * At this point folio must be either written or cleaned by @@ -960,8 +963,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, { void *shadow = NULL; int ret; + struct mem_cgroup *tmp; + bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags); + if (kernel_file) + tmp = set_active_memcg(root_mem_cgroup); ret = mem_cgroup_charge(folio, NULL, gfp); + if (kernel_file) + set_active_memcg(tmp); if (ret) return ret; @@ -983,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); + if (kernel_file) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, + folio_nr_pages(folio)); } return ret; } @@ -1140,10 +1153,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { - if (test_bit(key->bit_nr, &key->folio->flags)) + if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { - if (test_and_set_bit(key->bit_nr, &key->folio->flags)) + if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } @@ -1226,9 +1239,9 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(bit_nr, &folio->flags)) + if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; - } else if (test_bit(bit_nr, &folio->flags)) + } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; @@ -1961,7 +1974,7 @@ no_page: gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; - gfp |= GFP_NOWAIT | __GFP_NOWARN; + gfp |= GFP_NOWAIT; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; @@ -2447,6 +2460,9 @@ static bool filemap_range_uptodate(struct address_space *mapping, pos -= folio_pos(folio); } + if (pos == 0 && count >= folio_size(folio)) + return false; + return mapping->a_ops->is_partially_uptodate(folio, pos, count); } @@ -2619,9 +2635,10 @@ retry: goto err; } if (!folio_test_uptodate(folio)) { - if ((iocb->ki_flags & IOCB_WAITQ) && - folio_batch_count(fbatch) > 1) - iocb->ki_flags |= IOCB_NOWAIT; + if (folio_batch_count(fbatch) > 1) { + err = -EAGAIN; + goto err; + } err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) @@ -3323,9 +3340,17 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; - mmap_miss = READ_ONCE(ra->mmap_miss); - if (mmap_miss) - WRITE_ONCE(ra->mmap_miss, --mmap_miss); + /* + * If the folio is locked, we're likely racing against another fault. + * Don't touch the mmap_miss counter to avoid decreasing it multiple + * times for a single folio and break the balance with mmap_miss + * increase in do_sync_mmap_readahead(). + */ + if (likely(!folio_test_locked(folio))) { + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss) + WRITE_ONCE(ra->mmap_miss, --mmap_miss); + } if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); @@ -148,7 +148,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio))) return -EREMOTEIO; if (flags & FOLL_GET) @@ -475,10 +475,10 @@ EXPORT_SYMBOL_GPL(unpin_folios); * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ -static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +static inline void mm_set_has_pinned_flag(struct mm_struct *mm) { - if (!test_bit(MMF_HAS_PINNED, mm_flags)) - set_bit(MMF_HAS_PINNED, mm_flags); + if (!mm_flags_test(MMF_HAS_PINNED, mm)) + mm_flags_set(MMF_HAS_PINNED, mm); } #ifdef CONFIG_MMU @@ -1693,7 +1693,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, mmap_assert_locked(mm); if (flags & FOLL_PIN) - mm_set_has_pinned_flag(&mm->flags); + mm_set_has_pinned_flag(mm); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior @@ -3218,7 +3218,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, return -EINVAL; if (gup_flags & FOLL_PIN) - mm_set_has_pinned_flag(¤t->mm->flags); + mm_set_has_pinned_flag(current->mm); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c38a95e9f09..26cedfcd7418 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - bool smaps = tva_flags & TVA_SMAPS; - bool in_pf = tva_flags & TVA_IN_PF; - bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + const bool smaps = type == TVA_SMAPS; + const bool in_pf = type == TVA_PAGEFAULT; + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ @@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ @@ -167,14 +167,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, 0, - !enforce_sysfs); + forced_collapse); if (!vma_is_anonymous(vma)) { /* - * Enforce sysfs THP requirements as necessary. Anonymous vmas + * Enforce THP collapse requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders(). */ - if (enforce_sysfs && + if (!forced_collapse && (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && !hugepage_global_always()))) return 0; @@ -207,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, return orders; } -static bool get_huge_zero_page(void) +static bool get_huge_zero_folio(void) { struct folio *zero_folio; retry: @@ -237,7 +237,7 @@ retry: return true; } -static void put_huge_zero_page(void) +static void put_huge_zero_folio(void) { /* * Counter should never go to zero here. Only shrinker can put @@ -248,33 +248,39 @@ static void put_huge_zero_page(void) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return huge_zero_folio; + + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) return READ_ONCE(huge_zero_folio); - if (!get_huge_zero_page()) + if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) - put_huge_zero_page(); + if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm)) + put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); } void mm_put_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) - put_huge_zero_page(); + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return; + + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) + put_huge_zero_folio(); } -static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink, + struct shrink_control *sc) { /* we can free zero page only if last reference remains */ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; } -static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink, + struct shrink_control *sc) { if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct folio *zero_folio = xchg(&huge_zero_folio, NULL); @@ -287,7 +293,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, return 0; } -static struct shrinker *huge_zero_page_shrinker; +static struct shrinker *huge_zero_folio_shrinker; #ifdef CONFIG_SYSFS static ssize_t enabled_show(struct kobject *kobj, @@ -849,33 +855,47 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) static int __init thp_shrinker_init(void) { - huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero"); - if (!huge_zero_page_shrinker) - return -ENOMEM; - deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB, "thp-deferred_split"); - if (!deferred_split_shrinker) { - shrinker_free(huge_zero_page_shrinker); + if (!deferred_split_shrinker) return -ENOMEM; - } - - huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count; - huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; - shrinker_register(huge_zero_page_shrinker); deferred_split_shrinker->count_objects = deferred_split_count; deferred_split_shrinker->scan_objects = deferred_split_scan; shrinker_register(deferred_split_shrinker); + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { + /* + * Bump the reference of the huge_zero_folio and do not + * initialize the shrinker. + * + * huge_zero_folio will always be NULL on failure. We assume + * that get_huge_zero_folio() will most likely not fail as + * thp_shrinker_init() is invoked early on during boot. + */ + if (!get_huge_zero_folio()) + pr_warn("Allocating persistent huge zero folio failed\n"); + return 0; + } + + huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); + if (!huge_zero_folio_shrinker) { + shrinker_free(deferred_split_shrinker); + return -ENOMEM; + } + + huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count; + huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; + shrinker_register(huge_zero_folio_shrinker); + return 0; } static void __init thp_shrinker_exit(void) { - shrinker_free(huge_zero_page_shrinker); + shrinker_free(huge_zero_folio_shrinker); shrinker_free(deferred_split_shrinker); } @@ -911,7 +931,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < MB_TO_PAGES(512)) { transparent_hugepage_flags = 0; return 0; } @@ -1125,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (test_bit(MMF_TOPDOWN, ¤t->mm->flags) && !off_sub) + if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub) return ret + size; ret += off_sub; @@ -1309,6 +1329,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, { pmd_t entry; entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); + entry = pmd_mkspecial(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); @@ -1379,15 +1400,25 @@ struct folio_or_pfn { bool is_folio; }; -static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot, - bool write, pgtable_t pgtable) + bool write) { struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable = NULL; + spinlock_t *ptl; pmd_t entry; - lockdep_assert_held(pmd_lockptr(mm, pmd)); + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + ptl = pmd_lock(mm, pmd); if (!pmd_none(*pmd)) { const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : fop.pfn; @@ -1395,23 +1426,26 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, if (write) { if (pmd_pfn(*pmd) != pfn) { WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); - return -EEXIST; + goto out_unlock; } entry = pmd_mkyoung(*pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - - return -EEXIST; + goto out_unlock; } if (fop.is_folio) { entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); - folio_get(fop.folio); - folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); - add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); + if (is_huge_zero_folio(fop.folio)) { + entry = pmd_mkspecial(entry); + } else { + folio_get(fop.folio); + folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); + add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); + } } else { entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot)); entry = pmd_mkspecial(entry); @@ -1424,11 +1458,17 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); + pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); - return 0; + +out_unlock: + spin_unlock(ptl); + if (pgtable) + pte_free(mm, pgtable); + return VM_FAULT_NOPAGE; } /** @@ -1450,9 +1490,6 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, struct folio_or_pfn fop = { .pfn = pfn, }; - pgtable_t pgtable = NULL; - spinlock_t *ptl; - int error; /* * If we had pmd_special, we could avoid all these restrictions, @@ -1464,25 +1501,9 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; - - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } - pfnmap_setup_cachemode_pfn(pfn, &pgprot); - ptl = pmd_lock(vma->vm_mm, vmf->pmd); - error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write, - pgtable); - spin_unlock(ptl); - if (error && pgtable) - pte_free(vma->vm_mm, pgtable); - - return VM_FAULT_NOPAGE; + return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write); } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); @@ -1491,35 +1512,15 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PMD_MASK; - struct mm_struct *mm = vma->vm_mm; struct folio_or_pfn fop = { .folio = folio, .is_folio = true, }; - spinlock_t *ptl; - pgtable_t pgtable = NULL; - int error; - - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER)) return VM_FAULT_SIGBUS; - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } - - ptl = pmd_lock(mm, vmf->pmd); - error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, - write, pgtable); - spin_unlock(ptl); - if (error && pgtable) - pte_free(mm, pgtable); - - return VM_FAULT_NOPAGE; + return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write); } EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd); @@ -1531,25 +1532,30 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) return pud; } -static void insert_pud(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pud_t entry; + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : fop.pfn; if (write) { if (WARN_ON_ONCE(pud_pfn(*pud) != pfn)) - return; + goto out_unlock; entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) update_mmu_cache_pud(vma, addr, pud); } - return; + goto out_unlock; } if (fop.is_folio) { @@ -1568,6 +1574,9 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr, } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); +out_unlock: + spin_unlock(ptl); + return VM_FAULT_NOPAGE; } /** @@ -1589,7 +1598,6 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, struct folio_or_pfn fop = { .pfn = pfn, }; - spinlock_t *ptl; /* * If we had pud_special, we could avoid all these restrictions, @@ -1601,16 +1609,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; - pfnmap_setup_cachemode_pfn(pfn, &pgprot); - ptl = pud_lock(vma->vm_mm, vmf->pud); - insert_pud(vma, addr, vmf->pud, fop, pgprot, write); - spin_unlock(ptl); - - return VM_FAULT_NOPAGE; + return insert_pud(vma, addr, vmf->pud, fop, pgprot, write); } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); @@ -1627,25 +1628,15 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PUD_MASK; - pud_t *pud = vmf->pud; - struct mm_struct *mm = vma->vm_mm; struct folio_or_pfn fop = { .folio = folio, .is_folio = true, }; - spinlock_t *ptl; - - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER)) return VM_FAULT_SIGBUS; - ptl = pud_lock(mm, pud); - insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); - spin_unlock(ptl); - - return VM_FAULT_NOPAGE; + return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); } EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -1675,7 +1666,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, int ret = -ENOMEM; pmd = pmdp_get_lockless(src_pmd); - if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { + if (unlikely(pmd_present(pmd) && pmd_special(pmd) && + !is_huge_zero_pmd(pmd))) { dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -3310,8 +3302,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here. */ - new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - new_folio->flags |= (folio->flags & + new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; + new_folio->flags.f |= (folio->flags.f & ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_swapcache) | @@ -4327,8 +4319,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, goto out; } - pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", - pid, vaddr_start, vaddr_end); + pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n", + pid, vaddr_start, vaddr_end, new_order, in_folio_offset); mmap_read_lock(mm); /* @@ -4438,8 +4430,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, if (IS_ERR(candidate)) goto out; - pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", - file_path, off_start, off_end); + pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n", + file_path, off_start, off_end, new_order, in_folio_offset); mapping = candidate->f_mapping; min_order = mapping_min_folio_order(mapping); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eed59cfb5d21..1e777cc51ad0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3654,6 +3654,9 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; } + if (!h->max_huge_pages) + return; + /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; @@ -6932,6 +6935,11 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { + pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE); + if (actual_pte) { + ret = -EEXIST; + goto out; + } ret = -ENOMEM; goto out; } diff --git a/mm/internal.h b/mm/internal.h index 45b725c3dc03..45da9ff5694f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1333,11 +1333,6 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; -static inline bool is_migrate_highatomic(enum migratetype migratetype) -{ - return migratetype == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 8fce3370c84e..f084e7a5df1e 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -266,11 +266,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, } if (pgd_none(*pgd)) { - p4d_t *p; if (slab_is_available()) { - p = p4d_alloc(&init_mm, pgd, addr); - if (!p) + if (!p4d_alloc(&init_mm, pgd, addr)) return -ENOMEM; } else { pgd_populate_kernel(addr, pgd, diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index f4b17984b627..4cf2b5f8d6c1 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1073,6 +1073,45 @@ static void kmem_cache_rcu_uaf(struct kunit *test) kmem_cache_destroy(cache); } +/* + * Check that SLAB_TYPESAFE_BY_RCU objects are immediately reused when + * CONFIG_SLUB_RCU_DEBUG is off, and stay at the same address. + * Without this, KASAN builds would be unable to trigger bugs caused by + * SLAB_TYPESAFE_BY_RCU users handling reycled objects improperly. + */ +static void kmem_cache_rcu_reuse(struct kunit *test) +{ + char *p, *p2; + struct kmem_cache *cache; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_SLUB_RCU_DEBUG); + + cache = kmem_cache_create("test_cache", 16, 0, SLAB_TYPESAFE_BY_RCU, + NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + migrate_disable(); + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + goto out; + } + + kmem_cache_free(cache, p); + p2 = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p2) { + kunit_err(test, "Allocation failed: %s\n", __func__); + goto out; + } + KUNIT_EXPECT_PTR_EQ(test, p, p2); + + kmem_cache_free(cache, p2); + +out: + migrate_enable(); + kmem_cache_destroy(cache); +} + static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; @@ -2106,6 +2145,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), KUNIT_CASE(kmem_cache_rcu_uaf), + KUNIT_CASE(kmem_cache_rcu_reuse), KUNIT_CASE(kmem_cache_double_destroy), KUNIT_CASE(kmem_cache_accounted), KUNIT_CASE(kmem_cache_bulk), diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b486c1d19b2d..4ec324a4c1fe 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - test_bit(MMF_DISABLE_THP, &mm->flags); + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) @@ -445,7 +445,7 @@ void __khugepaged_enter(struct mm_struct *mm) /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; mm_slot = mm_slot_alloc(mm_slot_cache); @@ -472,10 +472,9 @@ void __khugepaged_enter(struct mm_struct *mm) void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, - PMD_ORDER)) + if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } @@ -497,7 +496,7 @@ void __khugepaged_exit(struct mm_struct *mm) spin_unlock(&khugepaged_mm_lock); if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { @@ -921,7 +920,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; - unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; + enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : + TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -932,7 +932,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1459,7 +1459,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Not strictly needed because the mm exited already. * - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ @@ -1533,9 +1533,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's - * analogously elide sysfs THP settings here. + * analogously elide sysfs THP settings here and force collapse. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2402,7 +2402,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, mm_slot = khugepaged_scan.mm_slot; slot = &mm_slot->slot; } else { - slot = list_entry(khugepaged_scan.mm_head.next, + slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; @@ -2432,8 +2432,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, - TVA_ENFORCE_SYSFS, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; @@ -2516,9 +2515,8 @@ breakouterloop_mmap_lock: * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ - if (slot->mm_node.next != &khugepaged_scan.mm_head) { - slot = list_entry(slot->mm_node.next, - struct mm_slot, mm_node); + if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { + slot = list_next_entry(slot, mm_node); khugepaged_scan.mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; @@ -2767,7 +2765,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); @@ -1217,8 +1217,8 @@ mm_exiting: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); @@ -2620,8 +2620,8 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2742,7 +2742,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && + if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && __ksm_should_add_vma(file, vm_flags)) vm_flags |= VM_MERGEABLE; @@ -2784,16 +2784,16 @@ int ksm_enable_merge_any(struct mm_struct *mm) { int err; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } - set_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_set(MMF_VM_MERGE_ANY, mm); ksm_add_vmas(mm); return 0; @@ -2815,7 +2815,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) { int err; - if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; err = ksm_del_vmas(mm); @@ -2824,7 +2824,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) return err; } - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); return 0; } @@ -2832,9 +2832,9 @@ int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) return 0; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } @@ -2852,7 +2852,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (!vma_ksm_compatible(vma)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; @@ -2912,7 +2912,7 @@ int __ksm_enter(struct mm_struct *mm) list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - set_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_set(MMF_VM_MERGEABLE, mm); mmgrab(mm); if (needs_wakeup) @@ -2954,8 +2954,8 @@ void __ksm_exit(struct mm_struct *mm) if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); + mm_flags_clear(MMF_VM_MERGEABLE, mm); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dd7fbed5a94..9712a751690f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2203,7 +2203,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. */ -void mem_cgroup_handle_over_high(gfp_t gfp_mask) +void __mem_cgroup_handle_over_high(gfp_t gfp_mask) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2213,9 +2213,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) struct mem_cgroup *memcg; bool in_retry = false; - if (likely(!nr_pages)) - return; - memcg = get_mem_cgroup_from_mm(current->mm); current->memcg_nr_pages_over_high = 0; @@ -2486,7 +2483,7 @@ done_restock: if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && gfpflags_allow_blocking(gfp_mask)) - mem_cgroup_handle_over_high(gfp_mask); + __mem_cgroup_handle_over_high(gfp_mask); return 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index df6ee59527dd..b93ab99ad3ef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1708,10 +1708,10 @@ static int identify_page_state(unsigned long pfn, struct page *p, * carried out only if the first check can't determine the page status. */ for (ps = error_states;; ps++) - if ((p->flags & ps->mask) == ps->res) + if ((p->flags.f & ps->mask) == ps->res) break; - page_flags |= (p->flags & (1UL << PG_dirty)); + page_flags |= (p->flags.f & (1UL << PG_dirty)); if (!ps->mask) for (ps = error_states;; ps++) @@ -2137,7 +2137,7 @@ retry: return action_result(pfn, MF_MSG_FREE_HUGE, res); } - page_flags = folio->flags; + page_flags = folio->flags.f; if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); @@ -2397,7 +2397,7 @@ try_again: * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = folio->flags; + page_flags = folio->flags.f; /* * __munlock_folio() may clear a writeback folio's LRU flag without @@ -2742,13 +2742,13 @@ static int soft_offline_in_use_page(struct page *page) putback_movable_pages(&pagelist); pr_info("%#lx: %s migration failed %ld, type %pGp\n", - pfn, msg_page[huge], ret, &page->flags); + pfn, msg_page[huge], ret, &page->flags.f); if (ret > 0) ret = -EBUSY; } } else { pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n", - pfn, msg_page[huge], page_count(page), &page->flags); + pfn, msg_page[huge], page_count(page), &page->flags.f); ret = -EBUSY; } return ret; diff --git a/mm/memory.c b/mm/memory.c index 0ba4f6b71847..d9de6c056179 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) add_mm_counter(mm, i, rss[i]); } -/* - * This function is called to print an error when a bad pte - * is found. For example, we might have a PFN-mapped pte in - * a region that doesn't allow it. - * - * The calling function must still handle the error. - */ -static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, struct page *page) +static bool is_bad_page_map_ratelimited(void) { - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud = pud_offset(p4d, addr); - pmd_t *pmd = pmd_offset(pud, addr); - struct address_space *mapping; - pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; @@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; - return; + return true; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", @@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; + return false; +} + +static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr) +{ + unsigned long long pgdv, p4dv, pudv, pmdv; + p4d_t p4d, *p4dp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pgd_t *pgdp; + + /* + * Although this looks like a fully lockless pgtable walk, it is not: + * see locking requirements for print_bad_page_map(). + */ + pgdp = pgd_offset(mm, addr); + pgdv = pgd_val(*pgdp); + + if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) { + pr_alert("pgd:%08llx\n", pgdv); + return; + } + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + p4dv = p4d_val(p4d); + + if (!p4d_present(p4d) || p4d_leaf(p4d)) { + pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv); + return; + } + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + pudv = pud_val(pud); + + if (!pud_present(pud) || pud_leaf(pud)) { + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv); + return; + } + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + pmdv = pmd_val(pmd); + + /* + * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE, + * because the table should already be mapped by the caller and + * doing another map would be bad. print_bad_page_map() should + * already take care of printing the PTE. + */ + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv, + p4dv, pudv, pmdv); +} + +/* + * This function is called to print an error when a bad page table entry (e.g., + * corrupted page table entry) is found. For example, we might have a + * PFN-mapped pte in a region that doesn't allow it. + * + * The calling function must still handle the error. + * + * This function must be called during a proper page table walk, as it will + * re-walk the page table to dump information: the caller MUST prevent page + * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf + * page table lock. + */ +static void print_bad_page_map(struct vm_area_struct *vma, + unsigned long addr, unsigned long long entry, struct page *page, + enum pgtable_level level) +{ + struct address_space *mapping; + pgoff_t index; + + if (is_bad_page_map_ratelimited()) + return; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, + pgtable_level_to_str(level), entry); + __print_bad_page_map_pgtable(vma->vm_mm, addr); if (page) - dump_page(page, "bad pte"); + dump_page(page, "bad page map"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", @@ -549,18 +611,39 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +#define print_bad_pte(vma, addr, pte, page) \ + print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) -/* - * vm_normal_page -- This function gets the "struct page" associated with a pte. +/** + * __vm_normal_page() - Get the "struct page" associated with a page table entry. + * @vma: The VMA mapping the page table entry. + * @addr: The address where the page table entry is mapped. + * @pfn: The PFN stored in the page table entry. + * @special: Whether the page table entry is marked "special". + * @level: The page table level for error reporting purposes only. + * @entry: The page table entry value for error reporting purposes only. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this - * case, NULL is returned here. "Normal" mappings do have a struct page. + * case, NULL is returned here. "Normal" mappings do have a struct page and + * are ordinarily refcounted. + * + * Page mappings of the shared zero folios are always considered "special", as + * they are not ordinarily refcounted: neither the refcount nor the mapcount + * of these folios is adjusted when mapping them into user page tables. + * Selected page table walkers (such as GUP) can still identify mappings of the + * shared zero folios and work with the underlying "struct page". * - * There are 2 broad cases. Firstly, an architecture may define a pte_special() - * pte bit, in which case this function is trivial. Secondly, an architecture - * may not have a spare pte bit, which requires a more complicated scheme, - * described below. + * There are 2 broad cases. Firstly, an architecture may define a "special" + * page table entry bit, such as pte_special(), in which case this function is + * trivial. Secondly, an architecture may not have a spare page table + * entry bit, which requires a more complicated scheme, described below. + * + * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on + * page table entries that actually map "normal" pages: however, that page + * cannot be looked up through the PFN stored in the page table entry, but + * instead will be looked up through vm_ops->find_normal_page(). So far, this + * only applies to PTEs. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). @@ -585,72 +668,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct - * page (that is, those where pfn_valid is true) are refcounted and considered - * normal pages by the VM. The only exception are zeropages, which are - * *never* refcounted. + * page (that is, those where pfn_valid is true, except the shared zero + * folios) are refcounted and considered normal pages by the VM. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. */ -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +static inline struct page *__vm_normal_page(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, bool special, + unsigned long long entry, enum pgtable_level level) { - unsigned long pfn = pte_pfn(pte); - if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { - if (likely(!pte_special(pte))) - goto check_pfn; - if (vma->vm_ops && vma->vm_ops->find_special_page) - return vma->vm_ops->find_special_page(vma, addr); - if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return NULL; - if (is_zero_pfn(pfn)) - return NULL; - - print_bad_pte(vma, addr, pte, NULL); - return NULL; - } - - /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ - - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - if (is_zero_pfn(pfn)) + if (unlikely(special)) { +#ifdef CONFIG_FIND_NORMAL_PAGE + if (vma->vm_ops && vma->vm_ops->find_normal_page) + return vma->vm_ops->find_normal_page(vma, addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) return NULL; + + print_bad_page_map(vma, addr, entry, NULL, level); + return NULL; } - } + /* + * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table + * mappings (incl. shared zero folios) are marked accordingly. + */ + } else { + if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + /* If it has a "struct page", it's "normal". */ + if (!pfn_valid(pfn)) + return NULL; + } else { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (is_zero_pfn(pfn)) - return NULL; + /* Only CoW'ed anon folios are "normal". */ + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) + return NULL; + } -check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { - print_bad_pte(vma, addr, pte, NULL); + /* Corrupted page table entry. */ + print_bad_page_map(vma, addr, entry, NULL, level); return NULL; } - /* * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. + * For example, VDSO mappings can cause them to exist. */ -out: - VM_WARN_ON_ONCE(is_zero_pfn(pfn)); + VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)); return pfn_to_page(pfn); } +/** + * vm_normal_page() - Get the "struct page" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte), + pte_val(pte), PGTABLE_LEVEL_PTE); +} + +/** + * vm_normal_folio() - Get the "struct folio" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { @@ -662,43 +777,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/** + * vm_normal_page_pmd() - Get the "struct page" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { - unsigned long pfn = pmd_pfn(pmd); - - /* Currently it's only used for huge pfnmaps */ - if (unlikely(pmd_special(pmd))) - return NULL; - - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; - } - } - - if (is_huge_zero_pfn(pfn)) - return NULL; - if (unlikely(pfn > highest_memmap_pfn)) - return NULL; - - /* - * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. - */ -out: - return pfn_to_page(pfn); + return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd), + pmd_val(pmd), PGTABLE_LEVEL_PMD); } +/** + * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { @@ -708,6 +817,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, return page_folio(page); return NULL; } + +/** + * vm_normal_page_pud() - Get the "struct page" associated with a PUD + * @vma: The VMA mapping the @pud. + * @addr: The address where the @pud is mapped. + * @pud: The PUD. + * + * Get the "struct page" associated with a PUD. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t pud) +{ + return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud), + pud_val(pud), PGTABLE_LEVEL_PUD); +} #endif /** @@ -4387,8 +4515,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); @@ -4935,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -5204,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any - * PMD mappings if THPs are disabled. + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. */ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) @@ -6126,8 +6256,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6161,8 +6290,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/migrate.c b/mm/migrate.c index 9e5ef39ce73a..8e435a078fc3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -231,18 +231,17 @@ static void putback_movable_ops_page(struct page *page) * src and dst are also released by migration core. These pages will not be * folios in the future, so that must be reworked. * - * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error - * code. + * Returns 0 on success, otherwise a negative error code. */ static int migrate_movable_ops_page(struct page *dst, struct page *src, enum migrate_mode mode) { - int rc = MIGRATEPAGE_SUCCESS; + int rc; VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src); VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src); rc = page_movable_ops(src)->migrate_page(dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) ClearPageMovableOpsIsolated(src); return rc; } @@ -587,7 +586,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - return MIGRATEPAGE_SUCCESS; + return 0; } oldzone = folio_zone(folio); @@ -688,7 +687,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } local_irq_enable(); - return MIGRATEPAGE_SUCCESS; + return 0; } int folio_migrate_mapping(struct address_space *mapping, @@ -737,7 +736,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, xas_unlock_irq(&xas); - return MIGRATEPAGE_SUCCESS; + return 0; } /* @@ -853,14 +852,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, return rc; rc = __folio_migrate_mapping(mapping, dst, src, expected_count); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (src_private) folio_attach_private(dst, folio_detach_private(src)); folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } /** @@ -967,7 +966,7 @@ recheck_buffers: } rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) goto unlock_buffers; bh = head; @@ -1071,7 +1070,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * * Return value: * < 0 - error code - * MIGRATEPAGE_SUCCESS - success + * 0 - success */ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1099,7 +1098,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, else rc = fallback_migrate_folio(mapping, dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { /* * For pagecache folios, src->mapping must be cleared before src * is freed. Anonymous folios must stay anonymous until freed. @@ -1189,7 +1188,7 @@ static void migrate_folio_done(struct folio *src, static int migrate_folio_unmap(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) + struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; @@ -1198,16 +1197,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, bool locked = false; bool dst_locked = false; - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. */ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. */ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - dst = get_new_folio(src, private); if (!dst) return -ENOMEM; @@ -1297,7 +1286,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (unlikely(page_has_movable_ops(&src->page))) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } /* @@ -1327,7 +1316,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (!folio_mapped(src)) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } out: @@ -1459,7 +1448,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_hugetlb(src); - return MIGRATEPAGE_SUCCESS; + return 0; } dst = get_new_folio(src, private); @@ -1522,8 +1511,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); + remove_migration_ptes(src, !rc ? dst : src, 0); unlock_put_anon: folio_unlock(dst); @@ -1532,7 +1520,7 @@ put_anon: if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { move_hugetlb_state(src, dst, reason); put_new_folio = NULL; } @@ -1540,7 +1528,7 @@ put_anon: out_unlock: folio_unlock(src); out: - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1650,7 +1638,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, reason, ret_folios); /* * The rules are: - * Success: hugetlb folio will be put back + * 0: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1667,7 +1655,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, retry++; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; break; default: @@ -1721,7 +1709,7 @@ static void migrate_folios_move(struct list_head *src_folios, reason, ret_folios); /* * The rules are: - * Success: folio will be freed + * 0: folio will be freed * -EAGAIN: stay on the unmap_folios list * Other errno: put on ret_folios list */ @@ -1731,7 +1719,7 @@ static void migrate_folios_move(struct list_head *src_folios, *thp_retry += is_thp; *nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; @@ -1870,14 +1858,27 @@ static int migrate_pages_batch(struct list_head *from, continue; } + /* + * If we are holding the last folio reference, the folio + * was freed from under us, so just drop our reference. + */ + if (likely(!page_has_movable_ops(&folio->page)) && + folio_ref_count(folio) == 1) { + folio_clear_active(folio); + folio_clear_unevictable(folio); + list_del(&folio->lru); + migrate_folio_done(folio, reason); + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + continue; + } + rc = migrate_folio_unmap(get_new_folio, put_new_folio, - private, folio, &dst, mode, reason, - ret_folios); + private, folio, &dst, mode, ret_folios); /* * The rules are: - * Success: folio will be freed - * Unmap: folio will be put on unmap_folios list, - * dst folio put on dst_folios list + * 0: folio will be put on unmap_folios list, + * dst folio put on dst_folios list * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1927,11 +1928,7 @@ static int migrate_pages_batch(struct list_head *from, thp_retry += is_thp; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: - stats->nr_succeeded += nr_pages; - stats->nr_thp_succeeded += is_thp; - break; - case MIGRATEPAGE_UNMAP: + case 0: list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e05e14d6eacd..abd9f6850db6 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) extra_cnt = 1; r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); - if (r != MIGRATEPAGE_SUCCESS) + if (r) src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; else folio_migrate_flags(newfolio, folio); diff --git a/mm/mincore.c b/mm/mincore.c index 10dabefc3acc..2f3e1816a30d 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -47,6 +47,48 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, return 0; } +static unsigned char mincore_swap(swp_entry_t entry, bool shmem) +{ + struct swap_info_struct *si; + struct folio *folio = NULL; + unsigned char present = 0; + + if (!IS_ENABLED(CONFIG_SWAP)) { + WARN_ON(1); + return 0; + } + + /* + * Shmem mapping may contain swapin error entries, which are + * absent. Page table may contain migration or hwpoison + * entries which are always uptodate. + */ + if (non_swap_entry(entry)) + return !shmem; + + /* + * Shmem mapping lookup is lockless, so we need to grab the swap + * device. mincore page table walk locks the PTL, and the swap + * device is stable, avoid touching the si for better performance. + */ + if (shmem) { + si = get_swap_device(entry); + if (!si) + return 0; + } + folio = filemap_get_entry(swap_address_space(entry), + swap_cache_index(entry)); + if (shmem) + put_swap_device(si); + /* The swap cache space contains either folio, shadow or NULL */ + if (folio && !xa_is_value(folio)) { + present = folio_test_uptodate(folio); + folio_put(folio); + } + + return present; +} + /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, @@ -64,8 +106,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - folio = filemap_get_incore_folio(mapping, index); - if (!IS_ERR(folio)) { + folio = filemap_get_entry(mapping, index); + if (folio) { + if (xa_is_value(folio)) { + if (shmem_mapping(mapping)) + return mincore_swap(radix_to_swp_entry(folio), + true); + else + return 0; + } present = folio_test_uptodate(folio); folio_put(folio); } @@ -143,23 +192,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (i = 0; i < step; i++) vec[i] = 1; } else { /* pte is a swap entry */ - swp_entry_t entry = pte_to_swp_entry(pte); - - if (non_swap_entry(entry)) { - /* - * migration or hwpoison entries are always - * uptodate - */ - *vec = 1; - } else { -#ifdef CONFIG_SWAP - *vec = mincore_page(swap_address_space(entry), - swap_cache_index(entry)); -#else - WARN_ON(1); - *vec = 1; -#endif - } + *vec = mincore_swap(pte_to_swp_entry(pte), false); } vec += step; } diff --git a/mm/mmap.c b/mm/mmap.c index 7306253cc3b5..7a057e0e8da9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { - if (test_bit(MMF_TOPDOWN, &mm->flags)) + if (mm_flags_test(MMF_TOPDOWN, mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm) * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); @@ -1859,14 +1859,14 @@ loop_out: mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index b006cec8e6fe..0a0db5849b8e 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -128,6 +128,95 @@ void vma_mark_detached(struct vm_area_struct *vma) } /* + * Try to read-lock a vma. The function is allowed to occasionally yield false + * locked result to avoid performance overhead, in which case we fall back to + * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. + * + * IMPORTANT: RCU lock must be held upon entering the function, but upon error + * IT IS RELEASED. The caller must handle this correctly. + */ +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + struct mm_struct *other_mm; + int oldcnt; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); + /* + * Check before locking. A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) { + vma = NULL; + goto err; + } + + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); + goto err; + } + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + if (unlikely(vma->vm_mm != mm)) + goto err_unstable; + + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. + * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq + * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). + */ + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); + vma = NULL; + goto err; + } + + return vma; +err: + rcu_read_unlock(); + + return vma; +err_unstable: + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */ + + /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + + return NULL; +} + +/* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the * function returns NULL. @@ -138,11 +227,13 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; - rcu_read_lock(); retry: + rcu_read_lock(); vma = mas_walk(&mas); - if (!vma) + if (!vma) { + rcu_read_unlock(); goto inval; + } vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { @@ -162,18 +253,17 @@ retry: * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ + rcu_read_unlock(); /* Check if the vma we locked is the right one. */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { + vma_end_read(vma); + goto inval; + } - rcu_read_unlock(); return vma; -inval_end_read: - vma_end_read(vma); inval: - rcu_read_unlock(); count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } @@ -228,6 +318,7 @@ retry: */ if (PTR_ERR(vma) == -EAGAIN) { /* reset to search from the last address */ + rcu_read_lock(); vma_iter_set(vmi, from_addr); goto retry; } @@ -257,9 +348,9 @@ retry: return vma; fallback_unlock: + rcu_read_unlock(); vma_end_read(vma); fallback: - rcu_read_unlock(); vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); rcu_read_lock(); /* Reinitialize the iterator after re-entering rcu read section */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index b49cc6385f1f..374aa6f021c6 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb) if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; - batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + batch = (void *)__get_free_page(GFP_NOWAIT); if (!batch) return false; @@ -364,7 +364,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { - *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); diff --git a/mm/mmzone.c b/mm/mmzone.c index f9baa8882fbf..0c8f181d9d50 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid) unsigned long old_flags, flags; int last_cpupid; - old_flags = READ_ONCE(folio->flags); + old_flags = READ_ONCE(folio->flags.f); do { flags = old_flags; last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags))); return last_cpupid; } diff --git a/mm/nommu.c b/mm/nommu.c index 8b819fafd57b..c3a23b082adb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -64,7 +64,7 @@ const struct vm_operations_struct generic_file_vm_ops = { */ unsigned int kobjsize(const void *objp) { - struct page *page; + struct folio *folio; /* * If the object we have should not have ksize performed on it, @@ -73,22 +73,22 @@ unsigned int kobjsize(const void *objp) if (!objp || !virt_addr_valid(objp)) return 0; - page = virt_to_head_page(objp); + folio = virt_to_folio(objp); /* * If the allocator sets PageSlab, we know the pointer came from * kmalloc(). */ - if (PageSlab(page)) + if (folio_test_slab(folio)) return ksize(objp); /* - * If it's not a compound page, see if we have a matching VMA + * If it's not a large folio, see if we have a matching VMA * region. This test is intentionally done in reverse order, * so if there's no VMA, we still fall through and hand back - * PAGE_SIZE for 0-order pages. + * PAGE_SIZE for 0-order folios. */ - if (!PageCompound(page)) { + if (!folio_test_large(folio)) { struct vm_area_struct *vma; vma = find_vma(current->mm, (unsigned long)objp); @@ -100,7 +100,7 @@ unsigned int kobjsize(const void *objp) * The ksize() function is only guaranteed to work for pointers * returned by kmalloc(). So handle arbitrary pointers here. */ - return page_size(page); + return folio_size(folio); } void vfree(const void *addr) @@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc_noprof); -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int node) { return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 25923cfec9c6..17650f0b516e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/oom_kill.c - * + * * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... @@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_SKIP, &p->mm->flags) || + mm_flags_test(MMF_OOM_SKIP, p->mm) || in_vfork(p)) { task_unlock(p); return LONG_MIN; @@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * any memory is quite low. */ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { - if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm)) goto next; goto abort; } @@ -524,7 +524,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) * should imply barriers already and the reader would hit a page fault * if it stumbled over a reaped memory. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) @@ -583,7 +583,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * under mmap_lock for reading because it serializes against the * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { trace_skip_task_reaping(tsk->pid); goto out_unlock; } @@ -619,7 +619,7 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || - test_bit(MMF_OOM_SKIP, &mm->flags)) + mm_flags_test(MMF_OOM_SKIP, mm)) goto done; pr_info("oom_reaper: unable to reap pid:%d (%s)\n", @@ -634,7 +634,7 @@ done: * Hide this mm from OOM killer because it has been either reaped or * somebody can't call mmap_write_unlock(mm). */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); @@ -670,7 +670,7 @@ static void wake_oom_reaper(struct timer_list *timer) unsigned long flags; /* The victim managed to terminate on its own - see exit_mmap */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { put_task_struct(tsk); return; } @@ -695,7 +695,7 @@ static void wake_oom_reaper(struct timer_list *timer) static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm)) return; get_task_struct(tsk); @@ -892,7 +892,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, mm)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -977,7 +977,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; if (is_global_init(p)) { can_oom_reap = false; - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); @@ -1235,7 +1235,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) reap = true; else { /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + if (!mm_flags_test(MMF_OOM_SKIP, mm)) ret = -EINVAL; } task_unlock(p); @@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure * possible change in exit_mmap is seen */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) + if (mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e248d1c3969..5f90fd6a7137 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -38,10 +38,10 @@ #include <linux/sched/rt.h> #include <linux/sched/signal.h> #include <linux/mm_inline.h> +#include <linux/shmem_fs.h> #include <trace/events/writeback.h> #include "internal.h" -#include "swap.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). @@ -2590,36 +2590,6 @@ done: } EXPORT_SYMBOL_GPL(writeback_iter); -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * Return: %0 on success, negative error code otherwise - * - * Note: please use writeback_iter() instead. - */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) -{ - struct folio *folio = NULL; - int error; - - while ((folio = writeback_iter(mapping, wbc, folio, &error))) { - error = writepage(folio, wbc, data); - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } - } - - return error; -} -EXPORT_SYMBOL(write_cache_pages); - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -2735,12 +2705,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, { unsigned long flags; + /* + * Shmem writeback relies on swap, and swap writeback is LRU based, + * not using the dirty mark. + */ + VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping)); + xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping); - __xa_set_mark(&mapping->i_pages, folio_index(folio), - PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -3019,7 +2995,7 @@ bool __folio_end_writeback(struct folio *folio) xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); - __xa_clear_mark(&mapping->i_pages, folio_index(folio), + __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -3056,7 +3032,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_use_writeback_tags(mapping)) { - XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + XA_STATE(xas, &mapping->i_pages, folio->index); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1d037f97c5f..0873d640f26c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit) { - return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS; + return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS; } static __always_inline void @@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, #else BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); #endif - BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits)); + BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK); VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitmap = get_pageblock_bitmap(page, pfn); @@ -538,8 +538,7 @@ static void set_pageblock_migratetype(struct page *page, "Use set_pageblock_isolate() for pageblock isolation"); return; } - VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page), - PB_migrate_isolate), + VM_WARN_ONCE(get_pageblock_isolate(page), "Use clear_pageblock_isolate() to unisolate pageblock"); /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ #endif @@ -797,7 +796,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages, if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); - else if (is_migrate_highatomic(migratetype)) + else if (migratetype == MIGRATE_HIGHATOMIC) WRITE_ONCE(zone->nr_free_highatomic, zone->nr_free_highatomic + nr_pages); } @@ -950,7 +949,7 @@ static inline void __free_one_page(struct page *page, bool to_tail; VM_BUG_ON(!zone_is_initialized(zone)); - VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); @@ -1043,7 +1042,7 @@ static inline bool page_expected_state(struct page *page, page->memcg_data | #endif page_pool_page_is_pp(page) | - (page->flags & check_flags))) + (page->flags.f & check_flags))) return false; return true; @@ -1059,7 +1058,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & flags)) { + if (unlikely(page->flags.f & flags)) { if (flags == PAGE_FLAGS_CHECK_AT_PREP) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; else @@ -1358,7 +1357,7 @@ __always_inline bool free_pages_prepare(struct page *page, int i; if (compound) { - page[1].flags &= ~PAGE_FLAGS_SECOND; + page[1].flags.f &= ~PAGE_FLAGS_SECOND; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif @@ -1372,7 +1371,7 @@ __always_inline bool free_pages_prepare(struct page *page, continue; } } - (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; } } if (folio_test_anon(folio)) { @@ -1391,7 +1390,7 @@ __always_inline bool free_pages_prepare(struct page *page, } page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); @@ -2034,7 +2033,13 @@ static int move_freepages_block(struct zone *zone, struct page *page, /* Look for a buddy that straddles start_pfn */ static unsigned long find_large_buddy(unsigned long start_pfn) { - int order = 0; + /* + * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing + * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking + * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy, + * the starting order does not matter. + */ + int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER; struct page *page; unsigned long pfn = start_pfn; @@ -2058,9 +2063,9 @@ static unsigned long find_large_buddy(unsigned long start_pfn) static inline void toggle_pageblock_isolate(struct page *page, bool isolate) { if (isolate) - set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); + set_pageblock_isolate(page); else - clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); + clear_pageblock_isolate(page); } /** @@ -4182,7 +4187,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) @@ -4408,7 +4413,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) if (!(gfp_mask & __GFP_NOMEMALLOC)) { alloc_flags |= ALLOC_NON_BLOCK; - if (order > 0) + if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE)) alloc_flags |= ALLOC_HIGHATOMIC; } @@ -5946,7 +5951,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta pcp->high_min = BOOT_PAGESET_HIGH; pcp->high_max = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; - pcp->free_count = 0; } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, @@ -6236,16 +6240,13 @@ static void calculate_totalreserve_pages(void) unsigned long managed_pages = zone_managed_pages(zone); /* Find valid and maximum lowmem_reserve in the zone */ - for (j = i; j < MAX_NR_ZONES; j++) { - if (zone->lowmem_reserve[j] > max) - max = zone->lowmem_reserve[j]; - } + for (j = i; j < MAX_NR_ZONES; j++) + max = max(max, zone->lowmem_reserve[j]); /* we treat the high watermark as reserved pages. */ max += high_wmark_pages(zone); - if (max > managed_pages) - max = managed_pages; + max = min_t(unsigned long, max, managed_pages); pgdat->totalreserve_pages += max; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d..c6753d370ff4 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -902,23 +902,23 @@ struct folio *folio_walk_start(struct folio_walk *fw, fw->pudp = pudp; fw->pud = pud; - /* - * TODO: FW_MIGRATION support for PUD migration entries - * once there are relevant users. - */ - if (!pud_present(pud) || pud_special(pud)) { + if (pud_none(pud)) { spin_unlock(ptl); goto not_found; - } else if (!pud_leaf(pud)) { + } else if (pud_present(pud) && !pud_leaf(pud)) { spin_unlock(ptl); goto pmd_table; + } else if (pud_present(pud)) { + page = vm_normal_page_pud(vma, addr, pud); + if (page) + goto found; } /* - * TODO: vm_normal_page_pud() will be handy once we want to - * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. + * TODO: FW_MIGRATION support for PUD migration entries + * once there are relevant users. */ - page = pud_page(pud); - goto found; + spin_unlock(ptl); + goto not_found; } pmd_table: diff --git a/mm/rmap.c b/mm/rmap.c index 568198e9efc2..34333ae3bd80 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -79,7 +79,6 @@ #include <asm/tlbflush.h> #define CREATE_TRACE_POINTS -#include <trace/events/tlb.h> #include <trace/events/migrate.h> #include "internal.h" @@ -285,7 +284,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; - avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); + avc = anon_vma_chain_alloc(GFP_NOWAIT); if (unlikely(!avc)) { unlock_anon_vma_root(root); root = NULL; @@ -1241,18 +1240,40 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -static __always_inline unsigned int __folio_add_rmap(struct folio *folio, +static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) +{ + int idx; + + if (nr) { + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; + __lruvec_stat_mod_folio(folio, idx, nr); + } + if (nr_pmdmapped) { + if (folio_test_anon(folio)) { + idx = NR_ANON_THPS; + __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + } else { + /* NR_*_PMDMAPPED are not maintained per-memcg */ + idx = folio_test_swapbacked(folio) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; + __mod_node_page_state(folio_pgdat(folio), idx, + nr_pmdmapped); + } + } +} + +static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level, int *nr_pmdmapped) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; - int first = 0, nr = 0; + int first = 0, nr = 0, nr_pmdmapped = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; @@ -1278,12 +1299,12 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, folio_add_large_mapcount(folio, orig_nr_pages, vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { - if (level == RMAP_LEVEL_PMD && first) - *nr_pmdmapped = folio_large_nr_pages(folio); + if (level == PGTABLE_LEVEL_PMD && first) + nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) /* Was completely unmapped. */ @@ -1301,8 +1322,8 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, * We only track PMD mappings of PMD-sized * folios separately. */ - if (level == RMAP_LEVEL_PMD) - *nr_pmdmapped = nr_pages; + if (level == PGTABLE_LEVEL_PMD) + nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) @@ -1314,8 +1335,10 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, } folio_inc_large_mapcount(folio, vma); break; + default: + BUILD_BUG(); } - return nr; + __folio_mod_stat(folio, nr, nr_pmdmapped); } /** @@ -1403,59 +1426,37 @@ static void __page_check_anon_rmap(const struct folio *folio, page); } -static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) -{ - int idx; - - if (nr) { - idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; - __lruvec_stat_mod_folio(folio, idx, nr); - } - if (nr_pmdmapped) { - if (folio_test_anon(folio)) { - idx = NR_ANON_THPS; - __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); - } else { - /* NR_*_PMDMAPPED are not maintained per-memcg */ - idx = folio_test_swapbacked(folio) ? - NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; - __mod_node_page_state(folio_pgdat(folio), idx, - nr_pmdmapped); - } - } -} - static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - unsigned long address, rmap_t flags, enum rmap_level level) + unsigned long address, rmap_t flags, enum pgtable_level level) { - int i, nr, nr_pmdmapped = 0; + int i; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); - nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped); + __folio_add_rmap(folio, page, nr_pages, vma, level); if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); - __folio_mod_stat(folio, nr, nr_pmdmapped); - if (flags & RMAP_EXCLUSIVE) { switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: SetPageAnonExclusive(page); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Keep the compiler happy, we don't support anonymous * PUD mappings. */ WARN_ON_ONCE(1); break; + default: + BUILD_BUG(); } } @@ -1509,7 +1510,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -1530,7 +1531,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1611,14 +1612,11 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { - int nr, nr_pmdmapped = 0; - VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); - nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped); - __folio_mod_stat(folio, nr, nr_pmdmapped); + __folio_add_rmap(folio, page, nr_pages, vma, level); /* See comments in folio_add_anon_rmap_*() */ if (!folio_test_large(folio)) @@ -1639,7 +1637,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1656,7 +1654,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1677,7 +1675,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); + __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif @@ -1685,7 +1683,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; @@ -1694,7 +1692,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; @@ -1704,7 +1702,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = folio_sub_return_large_mapcount(folio, nr_pages, vma); if (!nr) { /* Now completely unmapped. */ - nr = folio_nr_pages(folio); + nr = folio_large_nr_pages(folio); } else { partially_mapped = nr < folio_large_nr_pages(folio) && !folio_entire_mapcount(folio); @@ -1724,11 +1722,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && atomic_read(mapped); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { last = atomic_add_negative(-1, &folio->_entire_mapcount); - if (level == RMAP_LEVEL_PMD && last) + if (level == PGTABLE_LEVEL_PMD && last) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_dec_return_large_mapcount(folio, vma); if (!nr) { @@ -1748,9 +1746,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); - if (level == RMAP_LEVEL_PMD) + if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; - nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); + nr = nr_pages - nr; /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; @@ -1762,6 +1760,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && nr < nr_pmdmapped; break; + default: + BUILD_BUG(); } /* @@ -1801,7 +1801,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1818,7 +1818,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1839,7 +1839,7 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); + __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..640fecc42f60 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1006,15 +1006,15 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; @@ -1817,7 +1817,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, @@ -2430,7 +2430,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } folio_wait_writeback(folio); - nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the @@ -5081,7 +5080,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC | SB_I_VERSION; + sb->s_flags |= SB_NOSEC; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { @@ -5385,6 +5384,9 @@ int shmem_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; +#ifdef CONFIG_TMPFS + fc->sb_flags |= SB_I_VERSION; +#endif return 0; } diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..c41a512dd07c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -50,7 +50,7 @@ typedef union { /* Reuses the bits in struct page */ struct slab { - unsigned long flags; + memdesc_flags_t flags; struct kmem_cache *slab_cache; union { @@ -174,12 +174,12 @@ static inline void *slab_address(const struct slab *slab) static inline int slab_nid(const struct slab *slab) { - return folio_nid(slab_folio(slab)); + return memdesc_nid(slab->flags); } static inline pg_data_t *slab_pgdat(const struct slab *slab) { - return folio_pgdat(slab_folio(slab)); + return NODE_DATA(slab_nid(slab)); } static inline struct slab *virt_to_slab(const void *addr) diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..af343ca570b5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -657,17 +657,17 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) */ static inline bool slab_test_pfmemalloc(const struct slab *slab) { - return test_bit(SL_pfmemalloc, &slab->flags); + return test_bit(SL_pfmemalloc, &slab->flags.f); } static inline void slab_set_pfmemalloc(struct slab *slab) { - set_bit(SL_pfmemalloc, &slab->flags); + set_bit(SL_pfmemalloc, &slab->flags.f); } static inline void __slab_clear_pfmemalloc(struct slab *slab) { - __clear_bit(SL_pfmemalloc, &slab->flags); + __clear_bit(SL_pfmemalloc, &slab->flags.f); } /* @@ -675,12 +675,12 @@ static inline void __slab_clear_pfmemalloc(struct slab *slab) */ static __always_inline void slab_lock(struct slab *slab) { - bit_spin_lock(SL_locked, &slab->flags); + bit_spin_lock(SL_locked, &slab->flags.f); } static __always_inline void slab_unlock(struct slab *slab) { - bit_spin_unlock(SL_locked, &slab->flags); + bit_spin_unlock(SL_locked, &slab->flags.f); } static inline bool @@ -1046,7 +1046,7 @@ static void print_slab_info(const struct slab *slab) { pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - &slab->flags); + &slab->flags.f); } void skip_orig_size_check(struct kmem_cache *s, const void *object) @@ -2755,17 +2755,17 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) static inline bool slab_test_node_partial(const struct slab *slab) { - return test_bit(SL_partial, &slab->flags); + return test_bit(SL_partial, &slab->flags.f); } static inline void slab_set_node_partial(struct slab *slab) { - set_bit(SL_partial, &slab->flags); + set_bit(SL_partial, &slab->flags.f); } static inline void slab_clear_node_partial(struct slab *slab) { - clear_bit(SL_partial, &slab->flags); + clear_bit(SL_partial, &slab->flags.f); } /* @@ -4881,7 +4881,7 @@ void kfree(const void *object) EXPORT_SYMBOL(kfree); static __always_inline __realloc_size(2) void * -__do_krealloc(const void *p, size_t new_size, gfp_t flags) +__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid) { void *ret; size_t ks = 0; @@ -4895,6 +4895,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (!kasan_check_byte(p)) return NULL; + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. + */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { @@ -4917,6 +4927,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (new_size > ks) goto alloc_new; + /* If the old object doesn't satisfy the new alignment, allocate a new one */ + if (!IS_ALIGNED((unsigned long)p, align)) + goto alloc_new; + /* Zero out spare memory. */ if (want_init_on_alloc(flags)) { kasan_disable_current(); @@ -4939,7 +4953,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; alloc_new: - ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); @@ -4951,14 +4965,19 @@ alloc_new: } /** - * krealloc - reallocate memory. The contents will remain unchanged. + * krealloc_node_align - reallocate memory. The contents will remain unchanged. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. + * @align: desired alignment. * @flags: the type of memory to allocate. + * @nid: NUMA node or NUMA_NO_NODE * * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that @@ -4983,7 +5002,8 @@ alloc_new: * * Return: pointer to the allocated memory or %NULL in case of error */ -void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) +void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align, + gfp_t flags, int nid) { void *ret; @@ -4992,13 +5012,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) return ZERO_SIZE_PTR; } - ret = __do_krealloc(p, new_size, flags); + ret = __do_krealloc(p, new_size, align, flags, nid); if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) kfree(p); return ret; } -EXPORT_SYMBOL(krealloc_noprof); +EXPORT_SYMBOL(krealloc_node_align_noprof); static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) { @@ -5029,9 +5049,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @b: which set of kmalloc buckets to allocate from. + * @align: desired alignment. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * @@ -5041,7 +5065,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) { void *ret; @@ -5071,7 +5096,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } @@ -5115,14 +5140,19 @@ void kvfree_sensitive(const void *addr, size_t len) EXPORT_SYMBOL(kvfree_sensitive); /** - * kvrealloc - reallocate memory; contents remain unchanged + * kvrealloc_node_align - reallocate memory; contents remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: desired alignment * @flags: the flags for the page level allocator + * @nid: NUMA node id * * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 * and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that @@ -5136,17 +5166,18 @@ EXPORT_SYMBOL(kvfree_sensitive); * * Return: pointer to the allocated memory or %NULL in case of error */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { void *n; if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); + return vrealloc_node_align_noprof(p, size, align, flags, nid); - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid); if (!n) { /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); + n = kvmalloc_node_align_noprof(size, align, flags, nid); if (!n) return NULL; @@ -5162,7 +5193,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) return n; } -EXPORT_SYMBOL(kvrealloc_noprof); +EXPORT_SYMBOL(kvrealloc_node_align_noprof); struct detached_freelist { struct slab *slab; diff --git a/mm/sparse.c b/mm/sparse.c index e6075b622407..17c50a6415c2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(const struct page *page) +int memdesc_nid(memdesc_flags_t mdf) { - return section_to_node_table[page_to_section(page)]; + return section_to_node_table[memdesc_section(mdf)]; } -EXPORT_SYMBOL(page_to_nid); +EXPORT_SYMBOL(memdesc_nid); static void set_section_nid(unsigned long section_nr, int nid) { diff --git a/mm/swap.c b/mm/swap.c index b74ebe865dd9..b8cea6a1b86f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -388,14 +388,14 @@ static void __lru_cache_activate_folio(struct folio *folio) static void lru_gen_inc_refs(struct folio *folio) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return; } @@ -407,7 +407,7 @@ static void lru_gen_inc_refs(struct folio *folio) } new_flags = old_flags + BIT(LRU_REFS_PGOFF); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) @@ -419,7 +419,7 @@ static bool lru_gen_clear_refs(struct folio *folio) if (gen < 0) return true; - set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ @@ -1098,7 +1098,7 @@ static const struct ctl_table swap_sysctl_table[] = { */ void __init swap_setup(void) { - unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/swap.h b/mm/swap.h index 911ad5ff0f89..1ae44d4193b1 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -64,9 +64,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index); - struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); @@ -178,13 +175,6 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry, return NULL; } -static inline -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index) -{ - return filemap_get_folio(mapping, index); -} - static inline void *get_shadow_from_swap_cache(swp_entry_t entry) { return NULL; diff --git a/mm/swap_state.c b/mm/swap_state.c index c354435a0923..99513b74b5d8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -323,44 +323,6 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, return folio; } -/** - * filemap_get_incore_folio - Find and get a folio from the page or swap caches. - * @mapping: The address_space to search. - * @index: The page cache index. - * - * This differs from filemap_get_folio() in that it will also look for the - * folio in the swap cache. - * - * Return: The found folio or %NULL. - */ -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index) -{ - swp_entry_t swp; - struct swap_info_struct *si; - struct folio *folio = filemap_get_entry(mapping, index); - - if (!folio) - return ERR_PTR(-ENOENT); - if (!xa_is_value(folio)) - return folio; - if (!shmem_mapping(mapping)) - return ERR_PTR(-ENOENT); - - swp = radix_to_swp_entry(folio); - /* There might be swapin error entries in shmem mapping. */ - if (non_swap_entry(swp)) - return ERR_PTR(-ENOENT); - /* Prevent swapoff from happening to us */ - si = get_swap_device(swp); - if (!si) - return ERR_PTR(-ENOENT); - index = swap_cache_index(swp); - folio = filemap_get_folio(swap_address_space(swp), index); - put_swap_device(si); - return folio; -} - struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) diff --git a/mm/swapfile.c b/mm/swapfile.c index b4f3cc712580..a7ffabbe65ef 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -470,11 +470,6 @@ static void move_cluster(struct swap_info_struct *si, else list_move_tail(&ci->list, list); spin_unlock(&si->lock); - - if (ci->flags == CLUSTER_FLAG_FRAG) - atomic_long_dec(&si->frag_cluster_nr[ci->order]); - else if (new_flags == CLUSTER_FLAG_FRAG) - atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } @@ -825,6 +820,29 @@ out: return found; } +static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, + struct list_head *list, + unsigned int order, + unsigned char usage, + bool scan_all) +{ + unsigned int found = SWAP_ENTRY_INVALID; + + do { + struct swap_cluster_info *ci = isolate_lock_cluster(si, list); + unsigned long offset; + + if (!ci) + break; + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + if (found) + break; + } while (scan_all); + + return found; +} + static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; @@ -913,46 +931,46 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o } new_cluster: - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); + /* + * If the device need discard, prefer new cluster over nonfull + * to spread out the writes. + */ + if (si->flags & SWP_PAGE_DISCARD) { + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); if (found) goto done; } - /* Try reclaim from full clusters if free clusters list is drained */ + if (order < PMD_ORDER) { + found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], + order, usage, true); + if (found) + goto done; + } + + if (!(si->flags & SWP_PAGE_DISCARD)) { + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); + if (found) + goto done; + } + + /* Try reclaim full clusters if free and nonfull lists are drained */ if (vm_swap_full()) swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0, frags_existing; - - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - /* Clusters failed to allocate are moved to frag_clusters */ - frags++; - } - - frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); - while (frags < frags_existing && - (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { - atomic_long_dec(&si->frag_cluster_nr[order]); - /* - * Rotate the frag list to iterate, they were all - * failing high order allocation or moved here due to - * per-CPU usage, but they could contain newly released - * reclaimable (eg. lazy-freed swap cache) slots. - */ - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - frags++; - } + /* + * Scan only one fragment cluster is good enough. Order 0 + * allocation will surely success, and large allocation + * failure is not critical. Scanning one cluster still + * keeps the list rotated and reclaimed (for HAS_CACHE). + */ + found = alloc_swap_scan_list(si, &si->frag_clusters[order], order, + usage, false); + if (found) + goto done; } /* @@ -971,20 +989,15 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ - while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - atomic_long_dec(&si->frag_cluster_nr[o]); - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->frag_clusters[o], + 0, usage, true); + if (found) + goto done; - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], + 0, usage, true); + if (found) + goto done; } done: if (!(si->flags & SWP_SOLIDSTATE)) @@ -3224,7 +3237,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - atomic_long_set(&si->frag_cluster_nr[i], 0); } /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index aefdf3a812a1..50aaa8dcd24c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1026,18 +1026,64 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); } -static int move_present_pte(struct mm_struct *mm, - struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, - unsigned long dst_addr, unsigned long src_addr, - pte_t *dst_pte, pte_t *src_pte, - pte_t orig_dst_pte, pte_t orig_src_pte, - pmd_t *dst_pmd, pmd_t dst_pmdval, - spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio *src_folio) +/* + * Checks if the two ptes and the corresponding folio are eligible for batched + * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL. + * + * NOTE: folio's reference is not required as the whole operation is within + * PTL's critical section. + */ +static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, + unsigned long src_addr, + pte_t *src_pte, pte_t *dst_pte, + struct anon_vma *src_anon_vma) +{ + pte_t orig_dst_pte, orig_src_pte; + struct folio *folio; + + orig_dst_pte = ptep_get(dst_pte); + if (!pte_none(orig_dst_pte)) + return NULL; + + orig_src_pte = ptep_get(src_pte); + if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte))) + return NULL; + + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); + if (!folio || !folio_trylock(folio)) + return NULL; + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) || + folio_anon_vma(folio) != src_anon_vma) { + folio_unlock(folio); + return NULL; + } + return folio; +} + +/* + * Moves src folios to dst in a batch as long as they share the same + * anon_vma as the first folio, are not large, and can successfully + * take the lock via folio_trylock(). + */ +static long move_present_ptes(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio **first_src_folio, unsigned long len, + struct anon_vma *src_anon_vma) { int err = 0; + struct folio *src_folio = *first_src_folio; + unsigned long src_start = src_addr; + unsigned long src_end; + len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; + src_end = pmd_addr_end(src_addr, src_addr + len); + flush_cache_range(src_vma, src_addr, src_end); double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1051,31 +1097,56 @@ static int move_present_pte(struct mm_struct *mm, err = -EBUSY; goto out; } + /* It's safe to drop the reference now as the page-table is holding one. */ + folio_put(*first_src_folio); + *first_src_folio = NULL; + arch_enter_lazy_mmu_mode(); + + while (true) { + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); + /* Folio got pinned from under us. Put it back and fail the move. */ + if (folio_maybe_dma_pinned(src_folio)) { + set_pte_at(mm, src_addr, src_pte, orig_src_pte); + err = -EBUSY; + break; + } - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); - /* Folio got pinned from under us. Put it back and fail the move. */ - if (folio_maybe_dma_pinned(src_folio)) { - set_pte_at(mm, src_addr, src_pte, orig_src_pte); - err = -EBUSY; - goto out; - } - - folio_move_anon_rmap(src_folio, dst_vma); - src_folio->index = linear_page_index(dst_vma, dst_addr); + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); - orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); - /* Set soft dirty bit so userspace can notice the pte was moved */ + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); + /* Set soft dirty bit so userspace can notice the pte was moved */ #ifdef CONFIG_MEM_SOFT_DIRTY - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); #endif - if (pte_dirty(orig_src_pte)) - orig_dst_pte = pte_mkdirty(orig_dst_pte); - orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); + if (pte_dirty(orig_src_pte)) + orig_dst_pte = pte_mkdirty(orig_dst_pte); + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); + + src_addr += PAGE_SIZE; + if (src_addr == src_end) + break; + dst_addr += PAGE_SIZE; + dst_pte++; + src_pte++; + + folio_unlock(src_folio); + src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, + dst_pte, src_anon_vma); + if (!src_folio) + break; + } - set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); + arch_leave_lazy_mmu_mode(); + if (src_addr > src_start) + flush_tlb_range(src_vma, src_start, src_addr); + + if (src_folio) + folio_unlock(src_folio); out: double_pt_unlock(dst_ptl, src_ptl); - return err; + return src_addr > src_start ? src_addr - src_start : err; } static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1140,7 +1211,7 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); - return 0; + return PAGE_SIZE; } static int move_zeropage_pte(struct mm_struct *mm, @@ -1167,20 +1238,20 @@ static int move_zeropage_pte(struct mm_struct *mm, set_pte_at(mm, dst_addr, dst_pte, zero_pte); double_pt_unlock(dst_ptl, src_ptl); - return 0; + return PAGE_SIZE; } /* - * The mmap_lock for reading is held by the caller. Just move the page - * from src_pmd to dst_pmd if possible, and return true if succeeded - * in moving the page. + * The mmap_lock for reading is held by the caller. Just move the page(s) + * from src_pmd to dst_pmd if possible, and return number of bytes moved. + * On failure, an error code is returned. */ -static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, - struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, - unsigned long dst_addr, unsigned long src_addr, - __u64 mode) +static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + unsigned long len, __u64 mode) { swp_entry_t entry; struct swap_info_struct *si = NULL; @@ -1194,11 +1265,10 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; - int err = 0; + long ret = 0; - flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, - src_addr, src_addr + PAGE_SIZE); + src_addr, src_addr + len); mmu_notifier_invalidate_range_start(&range); retry: /* @@ -1212,7 +1282,7 @@ retry: /* Retry if a huge pmd materialized from under us */ if (unlikely(!dst_pte)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } @@ -1231,14 +1301,14 @@ retry: * transparent huge pages under us. */ if (unlikely(!src_pte)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* Sanity checks before the operation */ if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { - err = -EINVAL; + ret = -EINVAL; goto out; } @@ -1246,7 +1316,7 @@ retry: orig_dst_pte = ptep_get(dst_pte); spin_unlock(dst_ptl); if (!pte_none(orig_dst_pte)) { - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1255,21 +1325,21 @@ retry: spin_unlock(src_ptl); if (pte_none(orig_src_pte)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) - err = -ENOENT; + ret = -ENOENT; else /* nothing to do to move a hole */ - err = 0; + ret = PAGE_SIZE; goto out; } /* If PTE changed after we locked the folio them start over */ if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } if (pte_present(orig_src_pte)) { if (is_zero_pfn(pte_pfn(orig_src_pte))) { - err = move_zeropage_pte(mm, dst_vma, src_vma, + ret = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl); @@ -1292,14 +1362,14 @@ retry: spin_lock(src_ptl); if (!pte_same(orig_src_pte, ptep_get(src_pte))) { spin_unlock(src_ptl); - err = -EAGAIN; + ret = -EAGAIN; goto out; } folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); if (!folio || !PageAnonExclusive(&folio->page)) { spin_unlock(src_ptl); - err = -EBUSY; + ret = -EBUSY; goto out; } @@ -1313,7 +1383,7 @@ retry: */ if (!locked && folio_test_large(folio)) { spin_unlock(src_ptl); - err = -EAGAIN; + ret = -EAGAIN; goto out; } @@ -1332,7 +1402,7 @@ retry: } if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { - err = -EBUSY; + ret = -EBUSY; goto out; } } @@ -1343,8 +1413,8 @@ retry: pte_unmap(src_pte); pte_unmap(dst_pte); src_pte = dst_pte = NULL; - err = split_folio(src_folio); - if (err) + ret = split_folio(src_folio); + if (ret) goto out; /* have to reacquire the folio after it got split */ folio_unlock(src_folio); @@ -1362,7 +1432,7 @@ retry: src_anon_vma = folio_get_anon_vma(src_folio); if (!src_anon_vma) { /* page was unmapped from under us */ - err = -EAGAIN; + ret = -EAGAIN; goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { @@ -1375,10 +1445,11 @@ retry: } } - err = move_present_pte(mm, dst_vma, src_vma, - dst_addr, src_addr, dst_pte, src_pte, - orig_dst_pte, orig_src_pte, dst_pmd, - dst_pmdval, dst_ptl, src_ptl, src_folio); + ret = move_present_ptes(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl, &src_folio, + len, src_anon_vma); } else { struct folio *folio = NULL; @@ -1389,20 +1460,20 @@ retry: pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); - err = -EAGAIN; + ret = -EAGAIN; } else - err = -EFAULT; + ret = -EFAULT; goto out; } if (!pte_swp_exclusive(orig_src_pte)) { - err = -EBUSY; + ret = -EBUSY; goto out; } si = get_swap_device(entry); if (unlikely(!si)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* @@ -1422,7 +1493,7 @@ retry: swap_cache_index(entry)); if (!IS_ERR_OR_NULL(folio)) { if (folio_test_large(folio)) { - err = -EBUSY; + ret = -EBUSY; folio_put(folio); goto out; } @@ -1439,7 +1510,7 @@ retry: goto retry; } } - err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, src_folio, si, entry); } @@ -1466,7 +1537,7 @@ out: if (si) put_swap_device(si); - return err; + return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1737,7 +1808,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, { struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; - unsigned long src_addr, dst_addr; + unsigned long src_addr, dst_addr, src_end; pmd_t *src_pmd, *dst_pmd; long err = -EINVAL; ssize_t moved = 0; @@ -1780,8 +1851,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, if (err) goto out_unlock; - for (src_addr = src_start, dst_addr = dst_start; - src_addr < src_start + len;) { + for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len; + src_addr < src_end;) { spinlock_t *ptl; pmd_t dst_pmdval; unsigned long step_size; @@ -1849,6 +1920,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, dst_addr, src_addr); step_size = HPAGE_PMD_SIZE; } else { + long ret; + if (pmd_none(*src_pmd)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { err = -ENOENT; @@ -1865,10 +1938,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, break; } - err = move_pages_pte(mm, dst_pmd, src_pmd, - dst_vma, src_vma, - dst_addr, src_addr, mode); - step_size = PAGE_SIZE; + ret = move_pages_ptes(mm, dst_pmd, src_pmd, + dst_vma, src_vma, dst_addr, + src_addr, src_end - src_addr, mode); + if (ret < 0) + err = ret; + else + step_size = ret; } cond_resched(); diff --git a/mm/util.c b/mm/util.c index f814e6a59ab1..d235b74f7aff 100644 --- a/mm/util.c +++ b/mm/util.c @@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } #endif #ifdef CONFIG_MMU @@ -145,7 +145,7 @@ struct vma_merge_struct { */ bool __remove_middle :1; /* - * Internal flag used during the merge operationr to indicate we will + * Internal flag used during the merge operation to indicate we will * remove vmg->next. */ bool __remove_next :1; diff --git a/mm/vma_init.c b/mm/vma_init.c index 8e53c7943561..d847c6557261 100644 --- a/mm/vma_init.c +++ b/mm/vma_init.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared + * Functions for initializing, allocating, freeing and duplicating VMAs. Shared * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5edd536ba9d2..4249e1e01947 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4089,19 +4089,29 @@ void *vzalloc_node_noprof(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node_noprof); /** - * vrealloc - reallocate virtually contiguous memory; contents remain unchanged + * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents + * remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: requested alignment * @flags: the flags for the page level allocator + * @nid: node number of the target node + * + * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * - * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and - * @p is not a %NULL pointer, the object pointed to is freed. + * If the caller wants the new memory to be on specific node *only*, + * __GFP_THISNODE flag should be set, otherwise the function will try to avoid + * reallocation and possibly disregard the specified @nid. * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * + * Requesting an alignment that is bigger than the alignment of the existing + * allocation will fail. + * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * @@ -4111,7 +4121,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); * Return: pointer to the allocated memory; %NULL if @size is zero or in case of * failure */ -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { struct vm_struct *vm = NULL; size_t alloced_size = 0; @@ -4135,6 +4146,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) if (WARN(alloced_size < old_size, "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) return NULL; + if (WARN(!IS_ALIGNED((unsigned long)p, align), + "will not reallocate with a bigger alignment (0x%lx)\n", align)) + return NULL; + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(vmalloc_to_page(p))) + goto need_realloc; } /* @@ -4165,8 +4182,10 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) return (void *)p; } +need_realloc: /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ - n = __vmalloc_noprof(size, flags); + n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0)); + if (!n) return NULL; @@ -5177,7 +5196,7 @@ static void vmap_init_nodes(void) int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { - vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); + vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT); if (vn) { /* Node partition is 16 pages. */ vmap_zone_size = (1 << 4) * PAGE_SIZE; diff --git a/mm/vmscan.c b/mm/vmscan.c index 674999999cd0..ca9e1cd3cd68 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -398,14 +398,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); - /* - * If there are no reclaimable file-backed or anonymous pages, - * ensure zones with sufficient free pages are not skipped. - * This prevents zones like DMA32 from being ignored in reclaim - * scenarios where they can still help alleviate memory pressure. - */ - if (nr == 0) - nr = zone_page_state_snapshot(zone, NR_FREE_PAGES); + return nr; } @@ -888,11 +881,11 @@ static bool lru_gen_set_refs(struct folio *folio) { /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return false; } - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); return true; } #else @@ -3257,13 +3250,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return -1; } @@ -3274,7 +3267,7 @@ static int folio_update_gen(struct folio *folio, int gen) new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -3285,7 +3278,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); @@ -3302,7 +3295,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); @@ -4553,7 +4546,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); @@ -4766,7 +4759,7 @@ retry: /* don't add rejected folios to the oldest generation */ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); @@ -5561,6 +5554,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (memcg_id != mem_cgroup_id(memcg)) goto done; + sc->target_mem_cgroup = memcg; lruvec = get_lruvec(memcg, nid); if (swappiness < MIN_SWAPPINESS) @@ -5597,6 +5591,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, .may_swap = true, .reclaim_idx = MAX_NR_ZONES - 1, .gfp_mask = GFP_KERNEL, + .proactive = true, }; buf = kvmalloc(len + 1, GFP_KERNEL); @@ -6493,7 +6488,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { - if (!zone_reclaimable_pages(zone)) + if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES)) continue; pfmemalloc_reserve += min_wmark_pages(zone); diff --git a/mm/vmstat.c b/mm/vmstat.c index 71cd1ceba191..e522decf6a72 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1280,6 +1280,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING [I(PGPROMOTE_SUCCESS)] = "pgpromote_success", [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate", + [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl", #endif [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd", [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", @@ -1289,6 +1290,7 @@ const char * const vmstat_text[] = { [I(NR_HUGETLB)] = "nr_hugetlb", #endif [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", + [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages", #undef I /* system-wide enum vm_stat_item counters */ diff --git a/mm/workingset.c b/mm/workingset.c index 6e7f4cb1b9a7..68a76a91111f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else - set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 805a10b41266..153783d49d34 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1746,7 +1746,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * instead. */ if (!zpdesc->zspage) - return MIGRATEPAGE_SUCCESS; + return 0; /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); @@ -1813,7 +1813,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, reset_zpdesc(zpdesc); zpdesc_put(zpdesc); - return MIGRATEPAGE_SUCCESS; + return 0; } static void zs_page_putback(struct page *page) diff --git a/mm/zswap.c b/mm/zswap.c index 3c0fd8a13718..e5e1f5687f5e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -42,8 +42,10 @@ /********************************* * statistics **********************************/ -/* The number of compressed pages currently stored in zswap */ +/* The number of pages currently stored in zswap */ atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); +/* The number of incompressible pages currently stored in zswap */ +static atomic_long_t zswap_stored_incompressible_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -811,6 +813,8 @@ static void zswap_entry_free(struct zswap_entry *entry) obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); } + if (entry->length == PAGE_SIZE) + atomic_long_dec(&zswap_stored_incompressible_pages); zswap_entry_cache_free(entry); atomic_long_dec(&zswap_stored_pages); } @@ -827,7 +831,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) u8 *buffer = NULL; int ret; - buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); if (!buffer) { ret = -ENOMEM; goto fail; @@ -948,18 +952,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zpool *zpool; gfp_t gfp; u8 *dst; + bool mapped = false; acomp_ctx = acomp_ctx_get_cpu_lock(pool); dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); + sg_init_one(&output, dst, PAGE_SIZE); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* @@ -976,8 +976,26 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, */ comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (comp_ret) - goto unlock; + + /* + * If a page cannot be compressed into a size smaller than PAGE_SIZE, + * save the content as is without a compression, to keep the LRU order + * of writebacks. If writeback is disabled, reject the page since it + * only adds metadata overhead. swap_writeout() will put the page back + * to the active LRU list in the case. + */ + if (comp_ret || !dlen || dlen >= PAGE_SIZE) { + dlen = PAGE_SIZE; + if (!mem_cgroup_zswap_writeback_enabled( + folio_memcg(page_folio(page)))) { + comp_ret = comp_ret ? comp_ret : -EINVAL; + goto unlock; + } + comp_ret = 0; + dlen = PAGE_SIZE; + dst = kmap_local_page(page); + mapped = true; + } zpool = pool->zpool; gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; @@ -990,6 +1008,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, entry->length = dlen; unlock: + if (mapped) + kunmap_local(dst); if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) zswap_reject_compress_poor++; else if (comp_ret) @@ -1006,12 +1026,18 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - int decomp_ret, dlen; + int decomp_ret = 0, dlen = PAGE_SIZE; u8 *src, *obj; acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer); + /* zswap entries of length PAGE_SIZE are not compressed. */ + if (entry->length == PAGE_SIZE) { + memcpy_to_folio(folio, 0, obj, entry->length); + goto read_done; + } + /* * zpool_obj_read_begin() might return a kmap address of highmem when * acomp_ctx->buffer is not used. However, sg_init_one() does not @@ -1032,6 +1058,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; +read_done: zpool_obj_read_end(zpool, entry->handle, obj); acomp_ctx_put_unlock(acomp_ctx); @@ -1524,6 +1551,8 @@ static bool zswap_store_page(struct page *page, obj_cgroup_charge_zswap(objcg, entry->length); } atomic_long_inc(&zswap_stored_pages); + if (entry->length == PAGE_SIZE) + atomic_long_inc(&zswap_stored_incompressible_pages); /* * We finish initializing the entry while it's already in xarray. @@ -1792,6 +1821,14 @@ static int debugfs_get_stored_pages(void *data, u64 *val) } DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n"); +static int debugfs_get_stored_incompressible_pages(void *data, u64 *val) +{ + *val = atomic_long_read(&zswap_stored_incompressible_pages); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(stored_incompressible_pages_fops, + debugfs_get_stored_incompressible_pages, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1819,6 +1856,9 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_file("stored_pages", 0444, zswap_debugfs_root, NULL, &stored_pages_fops); + debugfs_create_file("stored_incompressible_pages", 0444, + zswap_debugfs_root, NULL, + &stored_incompressible_pages_fops); return 0; } |