diff options
Diffstat (limited to 'mm')
99 files changed, 6421 insertions, 4605 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ebd8ea353687..e8bf1e9e6ad9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -172,6 +172,7 @@ config SLUB config KVFREE_RCU_BATCHED def_bool y depends on !SLUB_TINY && !TINY_RCU + depends on !RCU_STRICT_GRACE_PERIOD config SLUB_TINY bool "Configure for minimal memory footprint" @@ -465,14 +466,11 @@ config HAVE_BOOTMEM_INFO_NODE config ARCH_ENABLE_MEMORY_HOTPLUG bool -config ARCH_ENABLE_MEMORY_HOTREMOVE - bool - # eventually, we can have this option just 'select SPARSEMEM' menuconfig MEMORY_HOTPLUG bool "Memory hotplug" select MEMORY_ISOLATION - depends on SPARSEMEM + depends on SPARSEMEM_VMEMMAP depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on 64BIT select NUMA_KEEP_MEMINFO if NUMA @@ -540,8 +538,8 @@ endchoice config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) - depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE - depends on MIGRATION + depends on MEMORY_HOTPLUG + select MIGRATION config MHP_MEMMAP_ON_MEMORY def_bool y @@ -572,6 +570,7 @@ config SPLIT_PTE_PTLOCKS depends on !ARM || CPU_CACHE_VIPT depends on !PARISC || PA20 depends on !SPARC32 + depends on !UML config ARCH_ENABLE_SPLIT_PMD_PTLOCK bool @@ -630,20 +629,20 @@ config PAGE_REPORTING those pages to another entity, such as a hypervisor, so that the memory can be freed within the host for other uses. -# -# support for page migration -# -config MIGRATION - bool "Page migration" +config NUMA_MIGRATION + bool "NUMA page migration" default y - depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU - help - Allows the migration of the physical location of pages of processes - while the virtual addresses are not changed. This is useful in - two situations. The first is on NUMA systems to put pages nearer - to the processors accessing. The second is when allocating huge - pages as migration can relocate pages to satisfy a huge page - allocation instead of reclaiming. 
+ depends on NUMA && MMU + select MIGRATION + help + Support the migration of pages to other NUMA nodes, available to + user space through interfaces like migrate_pages(), move_pages(), + and mbind(). Selecting this option also enables support for page + demotion for memory tiering. + +config MIGRATION + bool + depends on MMU config DEVICE_MIGRATION def_bool MIGRATION && ZONE_DEVICE diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 7638d75b27db..91b3e027b753 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -297,6 +297,17 @@ config DEBUG_KMEMLEAK_AUTO_SCAN If unsure, say Y. +config DEBUG_KMEMLEAK_VERBOSE + bool "Default kmemleak to verbose mode" + depends on DEBUG_KMEMLEAK_AUTO_SCAN + help + Say Y here to have kmemleak print unreferenced object details + (backtrace, hex dump, address) to dmesg when new memory leaks are + detected during automatic scanning. This can also be toggled at + runtime via /sys/module/kmemleak/parameters/verbose. + + If unsure, say N. + config PER_VMA_LOCK_STATS bool "Statistics for per-vma locks" depends on PER_VMA_LOCK diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7a18fa6c7272..cecbcf9060a6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -618,12 +618,13 @@ static void cgwb_release_workfn(struct work_struct *work) wb_shutdown(wb); css_put(wb->memcg_css); - css_put(wb->blkcg_css); - mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); + css_put(wb->blkcg_css); + mutex_unlock(&wb->bdi->cgwb_release_mutex); + fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index b0e2a9fa641f..3d7675a3ae04 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -40,57 +40,20 @@ void put_page_bootmem(struct page *page) } } -#ifndef CONFIG_SPARSEMEM_VMEMMAP static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, 
i; struct mem_section *ms; - struct page *page, *memmap; - struct mem_section_usage *usage; - - section_nr = pfn_to_section_nr(start_pfn); - ms = __nr_to_section(section_nr); - - /* Get section's memmap address */ - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - - /* - * Get page for the memmap's phys address - * XXX: need more consideration for sparse_vmemmap... - */ - page = virt_to_page(memmap); - mapsize = sizeof(struct page) * PAGES_PER_SECTION; - mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; - - /* remember memmap's page */ - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, SECTION_INFO); - - usage = ms->usage; - page = virt_to_page(usage); - - mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; - - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_SECTION_INFO); - -} -#else /* CONFIG_SPARSEMEM_VMEMMAP */ -static void __init register_page_bootmem_info_section(unsigned long start_pfn) -{ - unsigned long mapsize, section_nr, i; - struct mem_section *ms; - struct page *page, *memmap; struct mem_section_usage *usage; + struct page *page; + start_pfn = SECTION_ALIGN_DOWN(start_pfn); section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - if (!preinited_vmemmap_section(ms)) - register_page_bootmem_memmap(section_nr, memmap, - PAGES_PER_SECTION); + register_page_bootmem_memmap(section_nr, pfn_to_page(start_pfn), + PAGES_PER_SECTION); usage = ms->usage; page = virt_to_page(usage); @@ -100,7 +63,6 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) { @@ -52,6 +52,7 @@ const char *cma_get_name(const struct cma *cma) { return cma->name; } 
+EXPORT_SYMBOL_GPL(cma_get_name); static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, unsigned int align_order) @@ -951,6 +952,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, return page; } +EXPORT_SYMBOL_GPL(cma_alloc); static struct cma_memrange *find_cma_memrange(struct cma *cma, const struct page *pages, unsigned long count) @@ -1030,6 +1032,7 @@ bool cma_release(struct cma *cma, const struct page *pages, return true; } +EXPORT_SYMBOL_GPL(cma_release); bool cma_release_frozen(struct cma *cma, const struct page *pages, unsigned long count) diff --git a/mm/compaction.c b/mm/compaction.c index 1e8f8eca318c..3648ce22c807 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -518,6 +518,24 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } +static struct lruvec * +compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags, + struct compact_control *cc) +{ + struct lruvec *lruvec; + + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); + compact_lock_irqsave(&lruvec->lru_lock, flags, cc); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } + + return lruvec; +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. 
The lock should be periodically unlocked to avoid @@ -839,7 +857,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; struct lruvec *locked = NULL; struct folio *folio = NULL; @@ -913,7 +931,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!(low_pfn % COMPACT_CLUSTER_MAX)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -964,7 +982,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* for alloc_contig case */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1053,7 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(page_has_movable_ops(page)) && !PageMovableOpsIsolated(page)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1153,18 +1171,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!folio_test_clear_lru(folio)) goto isolate_fail_put; - lruvec = folio_lruvec(folio); + if (locked) + lruvec = folio_lruvec(folio); /* If we already hold the lock, we can skip some rechecking */ - if (lruvec != locked) { + if (lruvec != locked || !locked) { if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); - compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, folio); - /* * Try get exclusive access under lock. 
If marked for * skip, the scan is aborted unless the current context @@ -1226,7 +1243,7 @@ isolate_success_no_list: isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } folio_put(folio); @@ -1242,7 +1259,7 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } putback_movable_pages(&cc->migratepages); @@ -1274,7 +1291,7 @@ isolate_fail: isolate_abort: if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); if (folio) { folio_set_lru(folio); folio_put(folio); diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 8c868f7035fc..34631a44cdec 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,6 +12,17 @@ config DAMON See https://www.kernel.org/doc/html/latest/mm/damon/index.html for more information. +config DAMON_DEBUG_SANITY + bool "Check sanity of DAMON code" + depends on DAMON + help + This enables additional DAMON debugging-purpose sanity checks in + DAMON code. This can be useful for finding bugs, but impose + additional overhead. This is therefore recommended to be enabled on + only development and test setups. + + If unsure, say N. 
+ config DAMON_KUNIT_TEST bool "Test for damon" if !KUNIT_ALL_TESTS depends on DAMON && KUNIT=y diff --git a/mm/damon/core.c b/mm/damon/core.c index c1d1091d307e..3dbbbfdeff71 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -109,6 +109,17 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return err; } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_new_region(unsigned long start, unsigned long end) +{ + WARN_ONCE(start >= end, "start %lu >= end %lu\n", start, end); +} +#else +static void damon_verify_new_region(unsigned long start, unsigned long end) +{ +} +#endif + /* * Construct a damon_region struct * @@ -118,6 +129,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; + damon_verify_new_region(start, end); region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); if (!region) return NULL; @@ -140,8 +152,21 @@ void damon_add_region(struct damon_region *r, struct damon_target *t) t->nr_regions++; } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_del_region(struct damon_target *t) +{ + WARN_ONCE(t->nr_regions == 0, "t->nr_regions == 0\n"); +} +#else +static void damon_verify_del_region(struct damon_target *t) +{ +} +#endif + static void damon_del_region(struct damon_region *r, struct damon_target *t) { + damon_verify_del_region(t); + list_del(&r->list); t->nr_regions--; } @@ -362,6 +387,11 @@ void damos_destroy_quota_goal(struct damos_quota_goal *g) damos_free_quota_goal(g); } +static bool damos_quota_goals_empty(struct damos_quota *q) +{ + return list_empty(&q->goals); +} + /* initialize fields of @quota that normally API users wouldn't set */ static struct damos_quota *damos_quota_init(struct damos_quota *quota) { @@ -520,8 +550,27 @@ void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx) damon_free_target(t); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_nr_regions(struct damon_target *t) +{ + struct damon_region *r; + 
unsigned int count = 0; + + damon_for_each_region(r, t) + count++; + WARN_ONCE(count != t->nr_regions, "t->nr_regions (%u) != count (%u)\n", + t->nr_regions, count); +} +#else +static void damon_verify_nr_regions(struct damon_target *t) +{ +} +#endif + unsigned int damon_nr_regions(struct damon_target *t) { + damon_verify_nr_regions(t); + return t->nr_regions; } @@ -621,7 +670,7 @@ static unsigned int damon_accesses_bp_to_nr_accesses( static unsigned int damon_nr_accesses_to_accesses_bp( unsigned int nr_accesses, struct damon_attrs *attrs) { - return nr_accesses * 10000 / damon_max_nr_accesses(attrs); + return mult_frac(nr_accesses, 10000, damon_max_nr_accesses(attrs)); } static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, @@ -707,8 +756,16 @@ static bool damon_valid_intervals_goal(struct damon_attrs *attrs) * @ctx: monitoring context * @attrs: monitoring attributes * - * This function should be called while the kdamond is not running, an access - * check results aggregation is not ongoing (e.g., from damon_call(). + * This function updates monitoring results and next monitoring/damos operation + * schedules. Because those are periodically updated by kdamond, this should + * be called from a safe contexts. Such contexts include damon_ctx setup time + * while the kdamond is not yet started, and inside of kdamond_fn(). + * + * In detail, all DAMON API callers directly call this function for initial + * setup of damon_ctx before calling damon_start(). Some of the API callers + * also indirectly call this function via damon_call() -> damon_commit() for + * online parameters updates. Finally, kdamond_fn() itself use this for + * applying auto-tuned monitoring intervals. * * Every time interval is in micro-seconds. 
* @@ -860,6 +917,7 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) err = damos_commit_quota_goals(dst, src); if (err) return err; + dst->goal_tuner = src->goal_tuner; dst->weight_sz = src->weight_sz; dst->weight_nr_accesses = src->weight_nr_accesses; dst->weight_age = src->weight_age; @@ -1002,6 +1060,23 @@ static void damos_set_filters_default_reject(struct damos *s) damos_filters_default_reject(&s->ops_filters); } +/* + * damos_commit_dests() - Copy migration destinations from @src to @dst. + * @dst: Destination structure to update. + * @src: Source structure to copy from. + * + * If the number of destinations has changed, the old arrays in @dst are freed + * and new ones are allocated. On success, @dst contains a full copy of + * @src's arrays and count. + * + * On allocation failure, @dst is left in a partially torn-down state: its + * arrays may be NULL and @nr_dests may not reflect the actual allocation + * sizes. The structure remains safe to deallocate via damon_destroy_scheme(), + * but callers must not reuse @dst for further commits — it should be + * discarded. + * + * Return: 0 on success, -ENOMEM on allocation failure. + */ static int damos_commit_dests(struct damos_migrate_dests *dst, struct damos_migrate_dests *src) { @@ -1252,6 +1327,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { int err; + dst->maybe_corrupted = true; if (!is_power_of_2(src->min_region_sz)) return -EINVAL; @@ -1277,6 +1353,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) dst->addr_unit = src->addr_unit; dst->min_region_sz = src->min_region_sz; + dst->maybe_corrupted = false; return 0; } @@ -1314,6 +1391,40 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) return sz; } +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r); + +/* + * damon_apply_min_nr_regions() - Make effect of min_nr_regions parameter. 
+ * @ctx: monitoring context. + * + * This function implement min_nr_regions (minimum number of damon_region + * objects in the given monitoring context) behavior. It first calculates + * maximum size of each region for enforcing the min_nr_regions as total size + * of the regions divided by the min_nr_regions. After that, this function + * splits regions to ensure all regions are equal to or smaller than the size + * limit. Finally, this function returns the maximum size limit. + * + * Returns: maximum size of each region for convincing min_nr_regions. + */ +static unsigned long damon_apply_min_nr_regions(struct damon_ctx *ctx) +{ + unsigned long max_region_sz = damon_region_sz_limit(ctx); + struct damon_target *t; + struct damon_region *r, *next; + + max_region_sz = ALIGN(max_region_sz, ctx->min_region_sz); + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) { + while (damon_sz_region(r) > max_region_sz) { + damon_split_region_at(t, r, max_region_sz); + r = damon_next_region(r); + } + } + } + return max_region_sz; +} + static int kdamond_fn(void *data); /* @@ -1366,6 +1477,11 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive) int i; int err = 0; + for (i = 0; i < nr_ctxs; i++) { + if (!is_power_of_2(ctxs[i]->min_region_sz)) + return -EINVAL; + } + mutex_lock(&damon_lock); if ((exclusive && nr_running_ctxs) || (!exclusive && running_exclusive_ctxs)) { @@ -1462,35 +1578,6 @@ int damon_kdamond_pid(struct damon_ctx *ctx) return pid; } -/* - * damon_call_handle_inactive_ctx() - handle DAMON call request that added to - * an inactive context. - * @ctx: The inactive DAMON context. - * @control: Control variable of the call request. - * - * This function is called in a case that @control is added to @ctx but @ctx is - * not running (inactive). See if @ctx handled @control or not, and cleanup - * @control if it was not handled. - * - * Returns 0 if @control was handled by @ctx, negative error code otherwise. 
- */ -static int damon_call_handle_inactive_ctx( - struct damon_ctx *ctx, struct damon_call_control *control) -{ - struct damon_call_control *c; - - mutex_lock(&ctx->call_controls_lock); - list_for_each_entry(c, &ctx->call_controls, list) { - if (c == control) { - list_del(&control->list); - mutex_unlock(&ctx->call_controls_lock); - return -EINVAL; - } - } - mutex_unlock(&ctx->call_controls_lock); - return 0; -} - /** * damon_call() - Invoke a given function on DAMON worker thread (kdamond). * @ctx: DAMON context to call the function for. @@ -1508,6 +1595,10 @@ static int damon_call_handle_inactive_ctx( * synchronization. The return value of the function will be saved in * &damon_call_control->return_code. * + * Note that this function should be called only after damon_start() with the + * @ctx has succeeded. Otherwise, this function could fall into an indefinite + * wait. + * * Return: 0 on success, negative error code otherwise. */ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) @@ -1518,10 +1609,12 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) INIT_LIST_HEAD(&control->list); mutex_lock(&ctx->call_controls_lock); + if (ctx->call_controls_obsolete) { + mutex_unlock(&ctx->call_controls_lock); + return -ECANCELED; + } list_add_tail(&control->list, &ctx->call_controls); mutex_unlock(&ctx->call_controls_lock); - if (!damon_is_running(ctx)) - return damon_call_handle_inactive_ctx(ctx, control); if (control->repeat) return 0; wait_for_completion(&control->completion); @@ -1549,6 +1642,10 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) * passed at least one &damos->apply_interval_us, kdamond marks the request as * completed so that damos_walk() can wakeup and return. * + * Note that this function should be called only after damon_start() with the + * @ctx has succeeded. Otherwise, this function could fall into an indefinite + * wait. 
+ * * Return: 0 on success, negative error code otherwise. */ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) @@ -1556,19 +1653,16 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) init_completion(&control->completion); control->canceled = false; mutex_lock(&ctx->walk_control_lock); + if (ctx->walk_control_obsolete) { + mutex_unlock(&ctx->walk_control_lock); + return -ECANCELED; + } if (ctx->walk_control) { mutex_unlock(&ctx->walk_control_lock); return -EBUSY; } ctx->walk_control = control; mutex_unlock(&ctx->walk_control_lock); - if (!damon_is_running(ctx)) { - mutex_lock(&ctx->walk_control_lock); - if (ctx->walk_control == control) - ctx->walk_control = NULL; - mutex_unlock(&ctx->walk_control_lock); - return -EINVAL; - } wait_for_completion(&control->completion); if (control->canceled) return -ECANCELED; @@ -1588,6 +1682,23 @@ static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r) r->nr_accesses_bp = r->nr_accesses * 10000; } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_reset_aggregated(struct damon_region *r, + struct damon_ctx *c) +{ + WARN_ONCE(r->nr_accesses_bp != r->last_nr_accesses * 10000, + "nr_accesses_bp %u last_nr_accesses %u sis %lu %lu\n", + r->nr_accesses_bp, r->last_nr_accesses, + c->passed_sample_intervals, c->next_aggregation_sis); +} +#else +static void damon_verify_reset_aggregated(struct damon_region *r, + struct damon_ctx *c) +{ +} +#endif + + /* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ @@ -1604,6 +1715,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; + damon_verify_reset_aggregated(r, c); } ti++; } @@ -1626,7 +1738,7 @@ static unsigned long damon_get_intervals_score(struct damon_ctx *c) } target_access_events = max_access_events * goal_bp / 10000; target_access_events = target_access_events ? 
: 1; - return access_events * 10000 / target_access_events; + return mult_frac(access_events, 10000, target_access_events); } static unsigned long damon_feed_loop_next_input(unsigned long last_input, @@ -1670,9 +1782,6 @@ static void kdamond_tune_intervals(struct damon_ctx *c) damon_set_attrs(c, &new_attrs); } -static void damon_split_region_at(struct damon_target *t, - struct damon_region *r, unsigned long sz_r); - static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; @@ -1687,15 +1796,27 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) r->age <= s->pattern.max_age_region; } -static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, - struct damon_region *r, struct damos *s) +/* + * damos_quota_is_set() - Return if the given quota is actually set. + * @quota: The quota to check. + * + * Returns true if the quota is set, false otherwise. + */ +static bool damos_quota_is_set(struct damos_quota *quota) +{ + return quota->esz || quota->sz || quota->ms || + !damos_quota_goals_empty(quota); +} + +static bool damos_valid_target(struct damon_ctx *c, struct damon_region *r, + struct damos *s) { bool ret = __damos_valid_target(r, s); - if (!ret || !s->quota.esz || !c->ops.get_scheme_score) + if (!ret || !damos_quota_is_set(&s->quota) || !c->ops.get_scheme_score) return ret; - return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; + return c->ops.get_scheme_score(c, r, s) >= s->quota.min_score; } /* @@ -1715,17 +1836,18 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * This function checks if a given region should be skipped or not for the * reason. 
If only the starting part of the region has previously charged, * this function splits the region into two so that the second one covers the - * area that not charged in the previous charge widnow and saves the second - * region in *rp and returns false, so that the caller can apply DAMON action - * to the second one. + * area that not charged in the previous charge widnow, and return true. The + * caller can see the second one on the next iteration of the region walk. + * Note that this means the caller should use damon_for_each_region() instead + * of damon_for_each_region_safe(). If damon_for_each_region_safe() is used, + * the second region will just be ignored. * - * Return: true if the region should be entirely skipped, false otherwise. + * Return: true if the region should be skipped, false otherwise. */ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s, + struct damon_region *r, struct damos *s, unsigned long min_region_sz) { - struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; unsigned long sz_to_skip; @@ -1752,8 +1874,7 @@ static bool damos_skip_charged_region(struct damon_target *t, sz_to_skip = min_region_sz; } damon_split_region_at(t, r, sz_to_skip); - r = damon_next_region(r); - *rp = r; + return true; } quota->charge_target_from = NULL; quota->charge_addr_from = 0; @@ -1962,7 +2083,8 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, } if (c->ops.apply_scheme) { - if (quota->esz && quota->charged_sz + sz > quota->esz) { + if (damos_quota_is_set(quota) && + quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, c->min_region_sz); if (!sz) @@ -1981,7 +2103,8 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); quota->charged_sz += sz; - if (quota->esz && quota->charged_sz >= quota->esz) { + if 
(damos_quota_is_set(quota) && + quota->charged_sz >= quota->esz) { quota->charge_target_from = t; quota->charge_addr_from = r->ar.end + 1; } @@ -2002,24 +2125,25 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - if (c->passed_sample_intervals < s->next_apply_sis) + if (time_before(c->passed_sample_intervals, s->next_apply_sis)) continue; if (!s->wmarks.activated) continue; /* Check the quota */ - if (quota->esz && quota->charged_sz >= quota->esz) + if (damos_quota_is_set(quota) && + quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s, c->min_region_sz)) + if (damos_skip_charged_region(t, r, s, c->min_region_sz)) continue; if (s->max_nr_snapshots && s->max_nr_snapshots <= s->stat.nr_snapshots) continue; - if (damos_valid_target(c, t, r, s)) + if (damos_valid_target(c, r, s)) damos_apply_scheme(c, t, r, s); if (damon_is_last_region(r, t)) @@ -2098,18 +2222,30 @@ static inline u64 damos_get_some_mem_psi_total(void) #endif /* CONFIG_PSI */ #ifdef CONFIG_NUMA +static bool invalid_mem_node(int nid) +{ + return nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY); +} + static __kernel_ulong_t damos_get_node_mem_bp( struct damos_quota_goal *goal) { struct sysinfo i; __kernel_ulong_t numerator; + if (invalid_mem_node(goal->nid)) { + if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) + return 0; + else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */ + return 10000; + } + si_meminfo_node(&i, goal->nid); if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) numerator = i.totalram - i.freeram; else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */ numerator = i.freeram; - return numerator * 10000 / i.totalram; + return mult_frac(numerator, 10000, i.totalram); } static unsigned long damos_get_node_memcg_used_bp( @@ -2120,6 +2256,13 @@ static unsigned long damos_get_node_memcg_used_bp( unsigned long used_pages, numerator; struct sysinfo i; + if (invalid_mem_node(goal->nid)) { + if (goal->metric 
== DAMOS_QUOTA_NODE_MEMCG_USED_BP) + return 0; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + return 10000; + } + memcg = mem_cgroup_get_from_id(goal->memcg_id); if (!memcg) { if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) @@ -2142,7 +2285,7 @@ static unsigned long damos_get_node_memcg_used_bp( numerator = used_pages; else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ numerator = i.totalram - used_pages; - return numerator * 10000 / i.totalram; + return mult_frac(numerator, 10000, i.totalram); } #else static __kernel_ulong_t damos_get_node_mem_bp( @@ -2172,8 +2315,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio) global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); total = active + inactive; if (active_ratio) - return active * 10000 / total; - return inactive * 10000 / total; + return mult_frac(active, 10000, total); + return mult_frac(inactive, 10000, total); } static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) @@ -2216,17 +2359,38 @@ static unsigned long damos_quota_score(struct damos_quota *quota) damos_for_each_quota_goal(goal, quota) { damos_set_quota_goal_current_value(goal); highest_score = max(highest_score, - goal->current_value * 10000 / - goal->target_value); + mult_frac(goal->current_value, 10000, + goal->target_value)); } return highest_score; } +static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota) +{ + unsigned long score = damos_quota_score(quota); + + quota->esz_bp = damon_feed_loop_next_input( + max(quota->esz_bp, 10000UL), score); +} + +static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) +{ + unsigned long score = damos_quota_score(quota); + + if (score >= 10000) + quota->esz_bp = 0; + else if (quota->sz) + quota->esz_bp = quota->sz * 10000; + else + quota->esz_bp = ULONG_MAX; +} + /* * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty */ -static void damos_set_effective_quota(struct damos_quota *quota) +static void 
damos_set_effective_quota(struct damos_quota *quota, + struct damon_ctx *ctx) { unsigned long throughput; unsigned long esz = ULONG_MAX; @@ -2237,21 +2401,21 @@ static void damos_set_effective_quota(struct damos_quota *quota) } if (!list_empty("a->goals)) { - unsigned long score = damos_quota_score(quota); - - quota->esz_bp = damon_feed_loop_next_input( - max(quota->esz_bp, 10000UL), - score); + if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST) + damos_goal_tune_esz_bp_consist(quota); + else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL) + damos_goal_tune_esz_bp_temporal(quota); esz = quota->esz_bp / 10000; } if (quota->ms) { if (quota->total_charged_ns) - throughput = mult_frac(quota->total_charged_sz, 1000000, - quota->total_charged_ns); + throughput = mult_frac(quota->total_charged_sz, + 1000000, quota->total_charged_ns); else throughput = PAGE_SIZE * 1024; esz = min(throughput * quota->ms, esz); + esz = max(ctx->min_region_sz, esz); } if (quota->sz && quota->sz < esz) @@ -2288,20 +2452,22 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) /* First charge window */ if (!quota->total_charged_sz && !quota->charged_from) { quota->charged_from = jiffies; - damos_set_effective_quota(quota); + damos_set_effective_quota(quota, c); } /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + + if (!time_in_range_open(jiffies, quota->charged_from, + quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { - if (quota->esz && quota->charged_sz >= quota->esz) + if (damos_quota_is_set(quota) && + quota->charged_sz >= quota->esz) s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; if (trace_damos_esz_enabled()) cached_esz = quota->esz; - damos_set_effective_quota(quota); + damos_set_effective_quota(quota, c); if (trace_damos_esz_enabled() && quota->esz != cached_esz) damos_trace_esz(c, s, quota); } @@ -2317,7 +2483,9 @@ static void 
damos_adjust_quota(struct damon_ctx *c, struct damos *s) damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; - score = c->ops.get_scheme_score(c, t, r, s); + if (damos_core_filter_out(c, t, r, s)) + continue; + score = c->ops.get_scheme_score(c, r, s); c->regions_score_histogram[score] += damon_sz_region(r); if (score > max_score) @@ -2347,20 +2515,18 @@ static void damos_trace_stat(struct damon_ctx *c, struct damos *s) break; sidx++; } - trace_damos_stat_after_apply_interval(cidx, sidx, &s->stat); + trace_call__damos_stat_after_apply_interval(cidx, sidx, &s->stat); } static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; - struct damon_region *r, *next_r; + struct damon_region *r; struct damos *s; - unsigned long sample_interval = c->attrs.sample_interval ? - c->attrs.sample_interval : 1; bool has_schemes_to_apply = false; damon_for_each_scheme(s, c) { - if (c->passed_sample_intervals < s->next_apply_sis) + if (time_before(c->passed_sample_intervals, s->next_apply_sis)) continue; if (!s->wmarks.activated) @@ -2379,23 +2545,36 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (c->ops.target_valid && c->ops.target_valid(t) == false) continue; - damon_for_each_region_safe(r, next_r, t) + damon_for_each_region(r, t) damon_do_apply_schemes(c, t, r); } damon_for_each_scheme(s, c) { - if (c->passed_sample_intervals < s->next_apply_sis) + if (time_before(c->passed_sample_intervals, s->next_apply_sis)) continue; damos_walk_complete(c, s); - s->next_apply_sis = c->passed_sample_intervals + - (s->apply_interval_us ? 
s->apply_interval_us : - c->attrs.aggr_interval) / sample_interval; + damos_set_next_apply_sis(s, c); s->last_applied = NULL; damos_trace_stat(c, s); } mutex_unlock(&c->walk_control_lock); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_merge_two_regions( + struct damon_region *l, struct damon_region *r) +{ + /* damon_merge_two_regions() may created incorrect left region */ + WARN_ONCE(l->ar.start >= l->ar.end, "l: %lu-%lu, r: %lu-%lu\n", + l->ar.start, l->ar.end, r->ar.start, r->ar.end); +} +#else +static void damon_verify_merge_two_regions( + struct damon_region *l, struct damon_region *r) +{ +} +#endif + /* * Merge two adjacent regions into one region */ @@ -2409,9 +2588,24 @@ static void damon_merge_two_regions(struct damon_target *t, l->nr_accesses_bp = l->nr_accesses * 10000; l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; + damon_verify_merge_two_regions(l, r); damon_destroy_region(r, t); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_merge_regions_of(struct damon_region *r) +{ + WARN_ONCE(r->nr_accesses != r->nr_accesses_bp / 10000, + "nr_accesses (%u) != nr_accesses_bp (%u)\n", + r->nr_accesses, r->nr_accesses_bp); +} +#else +static void damon_verify_merge_regions_of(struct damon_region *r) +{ +} +#endif + + /* * Merge adjacent regions having similar access frequencies * @@ -2425,6 +2619,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { + damon_verify_merge_regions_of(r); if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; else if ((r->nr_accesses == 0) != (r->last_nr_accesses == 0)) @@ -2478,6 +2673,21 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, threshold / 2 < max_thres); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_split_region_at(struct damon_region *r, + unsigned long sz_r) +{ + WARN_ONCE(sz_r == 0 || 
sz_r >= damon_sz_region(r), + "sz_r: %lu r: %lu-%lu (%lu)\n", + sz_r, r->ar.start, r->ar.end, damon_sz_region(r)); +} +#else +static void damon_verify_split_region_at(struct damon_region *r, + unsigned long sz_r) +{ +} +#endif + /* * Split a region in two * @@ -2489,6 +2699,7 @@ static void damon_split_region_at(struct damon_target *t, { struct damon_region *new; + damon_verify_split_region_at(r, sz_r); new = damon_new_region(r->ar.start + sz_r, r->ar.end); if (!new) return; @@ -2678,6 +2889,8 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) complete(&control->completion); else if (control->canceled && control->dealloc_on_cancel) kfree(control); + if (!cancel && ctx->maybe_corrupted) + break; } mutex_lock(&ctx->call_controls_lock); @@ -2707,6 +2920,8 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) kdamond_usleep(min_wait_time); kdamond_call(ctx, false); + if (ctx->maybe_corrupted) + return -EINVAL; damos_walk_cancel(ctx); } return -EBUSY; @@ -2716,7 +2931,6 @@ static void kdamond_init_ctx(struct damon_ctx *ctx) { unsigned long sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; - unsigned long apply_interval; struct damos *scheme; ctx->passed_sample_intervals = 0; @@ -2727,9 +2941,7 @@ static void kdamond_init_ctx(struct damon_ctx *ctx) ctx->attrs.intervals_goal.aggrs; damon_for_each_scheme(scheme, ctx) { - apply_interval = scheme->apply_interval_us ? 
- scheme->apply_interval_us : ctx->attrs.aggr_interval; - scheme->next_apply_sis = apply_interval / sample_interval; + damos_set_next_apply_sis(scheme, ctx); damos_set_filters_default_reject(scheme); } } @@ -2745,6 +2957,12 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); + mutex_lock(&ctx->call_controls_lock); + ctx->call_controls_obsolete = false; + mutex_unlock(&ctx->call_controls_lock); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control_obsolete = false; + mutex_unlock(&ctx->walk_control_lock); complete(&ctx->kdamond_started); kdamond_init_ctx(ctx); @@ -2755,7 +2973,7 @@ static int kdamond_fn(void *data) if (!ctx->regions_score_histogram) goto done; - sz_limit = damon_region_sz_limit(ctx); + sz_limit = damon_apply_min_nr_regions(ctx); while (!kdamond_need_stop(ctx)) { /* @@ -2780,16 +2998,22 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (ctx->passed_sample_intervals >= next_aggregation_sis) + if (time_after_eq(ctx->passed_sample_intervals, + next_aggregation_sis)) { kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); + /* online updates might be made */ + sz_limit = damon_apply_min_nr_regions(ctx); + } /* * do kdamond_call() and kdamond_apply_schemes() after * kdamond_merge_regions() if possible, to reduce overhead */ kdamond_call(ctx, false); + if (ctx->maybe_corrupted) + break; if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); else @@ -2797,10 +3021,12 @@ static int kdamond_fn(void *data) sample_interval = ctx->attrs.sample_interval ? 
ctx->attrs.sample_interval : 1; - if (ctx->passed_sample_intervals >= next_aggregation_sis) { + if (time_after_eq(ctx->passed_sample_intervals, + next_aggregation_sis)) { if (ctx->attrs.intervals_goal.aggrs && - ctx->passed_sample_intervals >= - ctx->next_intervals_tune_sis) { + time_after_eq( + ctx->passed_sample_intervals, + ctx->next_intervals_tune_sis)) { /* * ctx->next_aggregation_sis might be updated * from kdamond_call(). In the case, @@ -2834,20 +3060,26 @@ static int kdamond_fn(void *data) kdamond_split_regions(ctx); } - if (ctx->passed_sample_intervals >= next_ops_update_sis) { + if (time_after_eq(ctx->passed_sample_intervals, + next_ops_update_sis)) { ctx->next_ops_update_sis = next_ops_update_sis + ctx->attrs.ops_update_interval / sample_interval; if (ctx->ops.update) ctx->ops.update(ctx); - sz_limit = damon_region_sz_limit(ctx); } } done: damon_destroy_targets(ctx); kfree(ctx->regions_score_histogram); + mutex_lock(&ctx->call_controls_lock); + ctx->call_controls_obsolete = true; + mutex_unlock(&ctx->call_controls_lock); kdamond_call(ctx, true); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control_obsolete = true; + mutex_unlock(&ctx->walk_control_lock); damos_walk_cancel(ctx); pr_debug("kdamond (%d) finishes\n", current->pid); @@ -2866,31 +3098,43 @@ done: static int walk_system_ram(struct resource *res, void *arg) { - struct damon_addr_range *a = arg; + struct resource *a = arg; - if (a->end - a->start < resource_size(res)) { + if (resource_size(a) < resource_size(res)) { a->start = res->start; a->end = res->end; } return 0; } +static unsigned long damon_res_to_core_addr(resource_size_t ra, + unsigned long addr_unit) +{ + /* + * Use div_u64() for avoiding linking errors related with __udivdi3, + * __aeabi_uldivmod, or similar problems. This should also improve the + * performance optimization (read div_u64() comment for the detail). 
+ */ + if (sizeof(ra) == 8 && sizeof(addr_unit) == 4) + return div_u64(ra, addr_unit); + return ra / addr_unit; +} + /* * Find biggest 'System RAM' resource and store its start and end address in * @start and @end, respectively. If no System RAM is found, returns false. */ static bool damon_find_biggest_system_ram(unsigned long *start, - unsigned long *end) + unsigned long *end, unsigned long addr_unit) { - struct damon_addr_range arg = {}; + struct resource res = {}; - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) + walk_system_ram_res(0, -1, &res, walk_system_ram); + *start = damon_res_to_core_addr(res.start, addr_unit); + *end = damon_res_to_core_addr(res.end + 1, addr_unit); + if (*end <= *start) return false; - - *start = arg.start; - *end = arg.end; return true; } @@ -2900,6 +3144,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. + * @addr_unit: The address unit for the damon_ctx of @t. * @min_region_sz: Minimum region size. * * This function sets the region of @t as requested by @start and @end. 
If the @@ -2912,7 +3157,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, */ int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, - unsigned long min_region_sz) + unsigned long addr_unit, unsigned long min_region_sz) { struct damon_addr_range addr_range; @@ -2920,7 +3165,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return -EINVAL; if (!*start && !*end && - !damon_find_biggest_system_ram(start, end)) + !damon_find_biggest_system_ram(start, end, addr_unit)) return -EINVAL; addr_range.start = *start; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 7bc5c0b2aea3..554559d72976 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -291,12 +291,6 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - /* - * If monitor_region_start/end are unset, always silently - * reset addr_unit to 1. - */ - if (!monitor_region_start && !monitor_region_end) - addr_unit = 1; param_ctx->addr_unit = addr_unit; param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); @@ -345,6 +339,7 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index a218d9922234..8c6d613425c1 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -90,7 +90,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr return; if (likely(pmd_present(pmdval))) - young |= pmdp_clear_young_notify(vma, addr, pmd); + young |= pmdp_test_and_clear_young(vma, addr, pmd); young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE); if (young) folio_set_young(folio); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 9bfe48826840..5cdcc5037cbc 100644 --- a/mm/damon/paddr.c +++ 
b/mm/damon/paddr.c @@ -343,8 +343,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, } static int damon_pa_scheme_score(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damon_region *r, struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 43d76f5bed44..86da14778658 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -201,12 +201,6 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; - /* - * If monitor_region_start/end are unset, always silently - * reset addr_unit to 1. - */ - if (!monitor_region_start && !monitor_region_end) - addr_unit = 1; param_ctx->addr_unit = addr_unit; param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); @@ -251,6 +245,7 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 25fb44ccf99d..99ba346f9e32 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -145,12 +145,59 @@ static int damon_stat_damon_call_fn(void *data) return 0; } +struct damon_stat_system_ram_range_walk_arg { + bool walked; + struct resource res; +}; + +static int damon_stat_system_ram_walk_fn(struct resource *res, void *arg) +{ + struct damon_stat_system_ram_range_walk_arg *a = arg; + + if (!a->walked) { + a->walked = true; + a->res.start = res->start; + } + a->res.end = res->end; + return 0; +} + +static unsigned long damon_stat_res_to_core_addr(resource_size_t ra, + unsigned long addr_unit) +{ + /* + * Use div_u64() for avoiding linking errors related with __udivdi3, + * __aeabi_uldivmod, or similar problems. This should also improve the + * performance optimization (read div_u64() comment for the detail). 
+ */ + if (sizeof(ra) == 8 && sizeof(addr_unit) == 4) + return div_u64(ra, addr_unit); + return ra / addr_unit; +} + +static int damon_stat_set_monitoring_region(struct damon_target *t, + unsigned long addr_unit, unsigned long min_region_sz) +{ + struct damon_addr_range addr_range; + struct damon_stat_system_ram_range_walk_arg arg = {}; + + walk_system_ram_res(0, -1, &arg, damon_stat_system_ram_walk_fn); + if (!arg.walked) + return -EINVAL; + addr_range.start = damon_stat_res_to_core_addr( + arg.res.start, addr_unit); + addr_range.end = damon_stat_res_to_core_addr( + arg.res.end + 1, addr_unit); + if (addr_range.end <= addr_range.start) + return -EINVAL; + return damon_set_regions(t, &addr_range, 1, min_region_sz); +} + static struct damon_ctx *damon_stat_build_ctx(void) { struct damon_ctx *ctx; struct damon_attrs attrs; struct damon_target *target; - unsigned long start = 0, end = 0; ctx = damon_new_ctx(); if (!ctx) @@ -180,8 +227,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_set_region_biggest_system_ram_default(target, &start, &end, - ctx->min_region_sz)) + if (damon_stat_set_monitoring_region(target, ctx->addr_unit, + ctx->min_region_sz)) goto free_out; return ctx; free_out: @@ -198,12 +245,21 @@ static int damon_stat_start(void) { int err; + if (damon_stat_context) { + if (damon_is_running(damon_stat_context)) + return -EAGAIN; + damon_destroy_ctx(damon_stat_context); + } + damon_stat_context = damon_stat_build_ctx(); if (!damon_stat_context) return -ENOMEM; err = damon_start(&damon_stat_context, 1, true); - if (err) + if (err) { + damon_destroy_ctx(damon_stat_context); + damon_stat_context = NULL; return err; + } damon_stat_last_refresh_jiffies = jiffies; call_control.data = damon_stat_context; @@ -214,6 +270,7 @@ static void damon_stat_stop(void) { damon_stop(&damon_stat_context, 1); damon_destroy_ctx(damon_stat_context); + damon_stat_context = NULL; } static int 
damon_stat_enabled_store( diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3a0782e576fa..5186966dafb3 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1488,6 +1488,7 @@ struct damon_sysfs_quotas { unsigned long sz; unsigned long reset_interval_ms; unsigned long effective_sz; /* Effective size quota in bytes */ + enum damos_quota_goal_tuner goal_tuner; }; static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) @@ -1610,6 +1611,58 @@ static ssize_t effective_bytes_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", quotas->effective_sz); } +struct damos_sysfs_qgoal_tuner_name { + enum damos_quota_goal_tuner tuner; + char *name; +}; + +static struct damos_sysfs_qgoal_tuner_name damos_sysfs_qgoal_tuner_names[] = { + { + .tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST, + .name = "consist", + }, + { + .tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, + .name = "temporal", + }, +}; + +static ssize_t goal_tuner_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_tuner_names); i++) { + struct damos_sysfs_qgoal_tuner_name *tuner_name; + + tuner_name = &damos_sysfs_qgoal_tuner_names[i]; + if (tuner_name->tuner == quotas->goal_tuner) + return sysfs_emit(buf, "%s\n", tuner_name->name); + } + return -EINVAL; +} + +static ssize_t goal_tuner_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_tuner_names); i++) { + struct damos_sysfs_qgoal_tuner_name *tuner_name; + + tuner_name = &damos_sysfs_qgoal_tuner_names[i]; + if (sysfs_streq(buf, tuner_name->name)) { + quotas->goal_tuner = tuner_name->tuner; + return count; + } + } + return -EINVAL; +} + static void 
damon_sysfs_quotas_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); @@ -1627,11 +1680,15 @@ static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr = __ATTR_RO_MODE(effective_bytes, 0400); +static struct kobj_attribute damon_sysfs_quotas_goal_tuner_attr = + __ATTR_RW_MODE(goal_tuner, 0600); + static struct attribute *damon_sysfs_quotas_attrs[] = { &damon_sysfs_quotas_ms_attr.attr, &damon_sysfs_quotas_sz_attr.attr, &damon_sysfs_quotas_reset_interval_ms_attr.attr, &damon_sysfs_quotas_effective_bytes_attr.attr, + &damon_sysfs_quotas_goal_tuner_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); @@ -2718,6 +2775,7 @@ static struct damos *damon_sysfs_mk_scheme( .weight_sz = sysfs_weights->sz, .weight_nr_accesses = sysfs_weights->nr_accesses, .weight_age = sysfs_weights->age, + .goal_tuner = sysfs_quotas->goal_tuner, }; struct damos_watermarks wmarks = { .metric = sysfs_wmarks->metric, diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 576d1ddd736b..eefa959aa30a 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1524,8 +1524,10 @@ static int damon_sysfs_commit_input(void *data) if (IS_ERR(param_ctx)) return PTR_ERR(param_ctx); test_ctx = damon_sysfs_new_test_ctx(kdamond->damon_ctx); - if (!test_ctx) + if (!test_ctx) { + damon_destroy_ctx(param_ctx); return -ENOMEM; + } err = damon_commit_ctx(test_ctx, param_ctx); if (err) goto out; @@ -1618,9 +1620,12 @@ static int damon_sysfs_repeat_call_fn(void *data) if (!mutex_trylock(&damon_sysfs_lock)) return 0; + if (sysfs_kdamond->contexts->nr != 1) + goto out; damon_sysfs_upd_tuned_intervals(sysfs_kdamond); damon_sysfs_upd_schemes_stats(sysfs_kdamond); damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond); +out: mutex_unlock(&damon_sysfs_lock); return 0; } @@ -1665,7 +1670,8 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) repeat_call_control->data = 
kdamond; repeat_call_control->repeat = true; repeat_call_control->dealloc_on_cancel = true; - damon_call(ctx, repeat_call_control); + if (damon_call(ctx, repeat_call_control)) + kfree(repeat_call_control); return err; } @@ -1747,6 +1753,9 @@ static int damon_sysfs_update_schemes_tried_regions( static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, struct damon_sysfs_kdamond *kdamond) { + if (cmd != DAMON_SYSFS_CMD_OFF && kdamond->contexts->nr != 1) + return -EINVAL; + switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig index 36a450f57b58..144d27e6ecc5 100644 --- a/mm/damon/tests/.kunitconfig +++ b/mm/damon/tests/.kunitconfig @@ -13,3 +13,6 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y CONFIG_SYSFS=y CONFIG_DAMON_SYSFS=y CONFIG_DAMON_SYSFS_KUNIT_TEST=y + +# enable DAMON_DEBUG_SANITY to catch any bug +CONFIG_DAMON_DEBUG_SANITY=y diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 596f33ec2d81..9e5904c2beeb 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -693,6 +693,7 @@ static void damos_test_commit_quota(struct kunit *test) .reset_interval = 1, .ms = 2, .sz = 3, + .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST, .weight_sz = 4, .weight_nr_accesses = 5, .weight_age = 6, @@ -701,6 +702,7 @@ static void damos_test_commit_quota(struct kunit *test) .reset_interval = 7, .ms = 8, .sz = 9, + .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, .weight_sz = 10, .weight_nr_accesses = 11, .weight_age = 12, @@ -714,6 +716,7 @@ static void damos_test_commit_quota(struct kunit *test) KUNIT_EXPECT_EQ(test, dst.reset_interval, src.reset_interval); KUNIT_EXPECT_EQ(test, dst.ms, src.ms); KUNIT_EXPECT_EQ(test, dst.sz, src.sz); + KUNIT_EXPECT_EQ(test, dst.goal_tuner, src.goal_tuner); KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz); KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses); KUNIT_EXPECT_EQ(test, dst.weight_age, 
src.weight_age); @@ -1057,6 +1060,27 @@ static void damon_test_commit_target_regions(struct kunit *test) (unsigned long[][2]) {{3, 8}, {8, 10}}, 2); } +static void damon_test_commit_ctx(struct kunit *test) +{ + struct damon_ctx *src, *dst; + + src = damon_new_ctx(); + if (!src) + kunit_skip(test, "src alloc fail"); + dst = damon_new_ctx(); + if (!dst) { + damon_destroy_ctx(src); + kunit_skip(test, "dst alloc fail"); + } + /* Only power of two min_region_sz is allowed. */ + src->min_region_sz = 4096; + KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0); + src->min_region_sz = 4095; + KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), -EINVAL); + damon_destroy_ctx(src); + damon_destroy_ctx(dst); +} + static void damos_test_filter_out(struct kunit *test) { struct damon_target *t; @@ -1239,6 +1263,79 @@ static void damon_test_set_filters_default_reject(struct kunit *test) damos_free_filter(target_filter); } +static void damon_test_apply_min_nr_regions_for(struct kunit *test, + unsigned long sz_regions, unsigned long min_region_sz, + unsigned long min_nr_regions, + unsigned long max_region_sz_expect, + unsigned long nr_regions_expect) +{ + struct damon_ctx *ctx; + struct damon_target *t; + struct damon_region *r; + unsigned long max_region_size; + + ctx = damon_new_ctx(); + if (!ctx) + kunit_skip(test, "ctx alloc fail\n"); + t = damon_new_target(); + if (!t) { + damon_destroy_ctx(ctx); + kunit_skip(test, "target alloc fail\n"); + } + damon_add_target(ctx, t); + r = damon_new_region(0, sz_regions); + if (!r) { + damon_destroy_ctx(ctx); + kunit_skip(test, "region alloc fail\n"); + } + damon_add_region(r, t); + + ctx->min_region_sz = min_region_sz; + ctx->attrs.min_nr_regions = min_nr_regions; + max_region_size = damon_apply_min_nr_regions(ctx); + + KUNIT_EXPECT_EQ(test, max_region_size, max_region_sz_expect); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_regions_expect); + + damon_destroy_ctx(ctx); +} + +static void damon_test_apply_min_nr_regions(struct kunit *test) 
+{ + /* common, expected setup */ + damon_test_apply_min_nr_regions_for(test, 10, 1, 10, 1, 10); + /* no zero size limit */ + damon_test_apply_min_nr_regions_for(test, 10, 1, 15, 1, 10); + /* max size should be aligned by min_region_sz */ + damon_test_apply_min_nr_regions_for(test, 10, 2, 2, 6, 2); + /* + * when min_nr_regions and min_region_sz conflicts, min_region_sz wins. + */ + damon_test_apply_min_nr_regions_for(test, 10, 2, 10, 2, 5); +} + +static void damon_test_is_last_region(struct kunit *test) +{ + struct damon_region *r; + struct damon_target *t; + int i; + + t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail\n"); + + for (i = 0; i < 4; i++) { + r = damon_new_region(i * 2, (i + 1) * 2); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc %d fail\n", i); + } + damon_add_region(r, t); + KUNIT_EXPECT_TRUE(test, damon_is_last_region(r, t)); + } + damon_free_target(t); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -1262,9 +1359,12 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_commit_pageout), KUNIT_CASE(damos_test_commit_migrate_hot), KUNIT_CASE(damon_test_commit_target_regions), + KUNIT_CASE(damon_test_commit_ctx), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), KUNIT_CASE(damon_test_set_filters_default_reject), + KUNIT_CASE(damon_test_apply_min_nr_regions), + KUNIT_CASE(damon_test_is_last_region), {}, }; diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index cfae870178bf..98e734d77d51 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -252,88 +252,12 @@ static void damon_test_apply_three_regions4(struct kunit *test) new_three_regions, expected, ARRAY_SIZE(expected)); } -static void damon_test_split_evenly_fail(struct kunit *test, - unsigned long start, unsigned long end, unsigned int nr_pieces) -{ - struct damon_target *t = 
damon_new_target(); - struct damon_region *r; - - if (!t) - kunit_skip(test, "target alloc fail"); - - r = damon_new_region(start, end); - if (!r) { - damon_free_target(t); - kunit_skip(test, "region alloc fail"); - } - - damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, - damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); - - damon_for_each_region(r, t) { - KUNIT_EXPECT_EQ(test, r->ar.start, start); - KUNIT_EXPECT_EQ(test, r->ar.end, end); - } - - damon_free_target(t); -} - -static void damon_test_split_evenly_succ(struct kunit *test, - unsigned long start, unsigned long end, unsigned int nr_pieces) -{ - struct damon_target *t = damon_new_target(); - struct damon_region *r; - unsigned long expected_width = (end - start) / nr_pieces; - unsigned long i = 0; - - if (!t) - kunit_skip(test, "target alloc fail"); - r = damon_new_region(start, end); - if (!r) { - damon_free_target(t); - kunit_skip(test, "region alloc fail"); - } - damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, - damon_va_evenly_split_region(t, r, nr_pieces), 0); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); - - damon_for_each_region(r, t) { - if (i == nr_pieces - 1) { - KUNIT_EXPECT_EQ(test, - r->ar.start, start + i * expected_width); - KUNIT_EXPECT_EQ(test, r->ar.end, end); - break; - } - KUNIT_EXPECT_EQ(test, - r->ar.start, start + i++ * expected_width); - KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); - } - damon_free_target(t); -} - -static void damon_test_split_evenly(struct kunit *test) -{ - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), - -EINVAL); - - damon_test_split_evenly_fail(test, 0, 100, 0); - damon_test_split_evenly_succ(test, 0, 100, 10); - damon_test_split_evenly_succ(test, 5, 59, 5); - damon_test_split_evenly_succ(test, 4, 6, 1); - damon_test_split_evenly_succ(test, 0, 3, 2); - damon_test_split_evenly_fail(test, 5, 6, 2); -} - static struct kunit_case damon_test_cases[] = { 
KUNIT_CASE(damon_test_three_regions_in_vmas), KUNIT_CASE(damon_test_apply_three_regions1), KUNIT_CASE(damon_test_apply_three_regions2), KUNIT_CASE(damon_test_apply_three_regions3), KUNIT_CASE(damon_test_apply_three_regions4), - KUNIT_CASE(damon_test_split_evenly), {}, }; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 729b7ffd3565..b069dbc7e3d2 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -53,52 +53,6 @@ static struct mm_struct *damon_get_mm(struct damon_target *t) return mm; } -/* - * Functions for the initial monitoring target regions construction - */ - -/* - * Size-evenly split a region into 'nr_pieces' small regions - * - * Returns 0 on success, or negative error code otherwise. - */ -static int damon_va_evenly_split_region(struct damon_target *t, - struct damon_region *r, unsigned int nr_pieces) -{ - unsigned long sz_orig, sz_piece, orig_end; - struct damon_region *n = NULL, *next; - unsigned long start; - unsigned int i; - - if (!r || !nr_pieces) - return -EINVAL; - - if (nr_pieces == 1) - return 0; - - orig_end = r->ar.end; - sz_orig = damon_sz_region(r); - sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION_SZ); - - if (!sz_piece) - return -EINVAL; - - r->ar.end = r->ar.start + sz_piece; - next = damon_next_region(r); - for (start = r->ar.end, i = 1; i < nr_pieces; start += sz_piece, i++) { - n = damon_new_region(start, start + sz_piece); - if (!n) - return -ENOMEM; - damon_insert_region(n, r, next, t); - r = n; - } - /* complement last region for possible rounding error */ - if (n) - n->ar.end = orig_end; - - return 0; -} - static unsigned long sz_range(struct damon_addr_range *r) { return r->end - r->start; @@ -240,10 +194,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { struct damon_target *ti; - struct damon_region *r; struct damon_addr_range regions[3]; - unsigned long sz = 0, nr_pieces; - int i, tidx = 0; + int tidx = 0; if (damon_va_three_regions(t, regions)) { 
damon_for_each_target(ti, ctx) { @@ -255,25 +207,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, return; } - for (i = 0; i < 3; i++) - sz += regions[i].end - regions[i].start; - if (ctx->attrs.min_nr_regions) - sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION_SZ) - sz = DAMON_MIN_REGION_SZ; - - /* Set the initial three regions of the target */ - for (i = 0; i < 3; i++) { - r = damon_new_region(regions[i].start, regions[i].end); - if (!r) { - pr_err("%d'th init region creation failed\n", i); - return; - } - damon_add_region(r, t); - - nr_pieces = (regions[i].end - regions[i].start) / sz; - damon_va_evenly_split_region(t, r, nr_pieces); - } + damon_set_regions(t, regions, 3, DAMON_MIN_REGION_SZ); } /* Initialize '->regions_list' of every target (task) */ @@ -985,8 +919,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, } static int damon_va_scheme_score(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damon_region *r, struct damos *scheme) { switch (scheme->action) { diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 83cf07269f13..23dc3ee09561 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -445,7 +445,7 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) * X86 defined pmd_set_huge() verifies that the given * PMD is not a populated non-leaf entry. */ - WRITE_ONCE(*args->pmdp, __pmd(0)); + pmd_clear(args->pmdp); WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); WARN_ON(!pmd_clear_huge(args->pmdp)); pmd = pmdp_get(args->pmdp); @@ -465,7 +465,7 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args) * X86 defined pud_set_huge() verifies that the given * PUD is not a populated non-leaf entry. 
*/ - WRITE_ONCE(*args->pudp, __pud(0)); + pud_clear(args->pudp); WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); WARN_ON(!pud_clear_huge(args->pudp)); pud = pudp_get(args->pudp); diff --git a/mm/execmem.c b/mm/execmem.c index 810a4ba9c924..084a207e4278 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -203,13 +203,6 @@ static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask) return mas_store_gfp(&mas, (void *)lower, gfp_mask); } -static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask) -{ - guard(mutex)(&execmem_cache.mutex); - - return execmem_cache_add_locked(ptr, size, gfp_mask); -} - static bool within_range(struct execmem_range *range, struct ma_state *mas, size_t size) { @@ -225,18 +218,16 @@ static bool within_range(struct execmem_range *range, struct ma_state *mas, return false; } -static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) +static void *execmem_cache_alloc_locked(struct execmem_range *range, size_t size) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct maple_tree *busy_areas = &execmem_cache.busy_areas; MA_STATE(mas_free, free_areas, 0, ULONG_MAX); MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); - struct mutex *mutex = &execmem_cache.mutex; unsigned long addr, last, area_size = 0; void *area, *ptr = NULL; int err; - mutex_lock(mutex); mas_for_each(&mas_free, area, ULONG_MAX) { area_size = mas_range_len(&mas_free); @@ -245,7 +236,7 @@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) } if (area_size < size) - goto out_unlock; + return NULL; addr = mas_free.index; last = mas_free.last; @@ -254,7 +245,7 @@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) mas_set_range(&mas_busy, addr, addr + size - 1); err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL); if (err) - goto out_unlock; + return NULL; mas_store_gfp(&mas_free, NULL, GFP_KERNEL); if (area_size > size) { @@ -268,19 +259,25 
@@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL); if (err) { mas_store_gfp(&mas_busy, NULL, GFP_KERNEL); - goto out_unlock; + return NULL; } } ptr = (void *)addr; -out_unlock: - mutex_unlock(mutex); return ptr; } -static int execmem_cache_populate(struct execmem_range *range, size_t size) +static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) +{ + guard(mutex)(&execmem_cache.mutex); + + return execmem_cache_alloc_locked(range, size); +} + +static void *execmem_cache_populate_alloc(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; + struct mutex *mutex = &execmem_cache.mutex; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -294,7 +291,7 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) } if (!p) - return err; + return NULL; vm = find_vm_area(p); if (!vm) @@ -307,33 +304,39 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) if (err) goto err_free_mem; - err = execmem_cache_add(p, alloc_size, GFP_KERNEL); + /* + * New memory blocks must be allocated and added to the cache + * as an atomic operation, otherwise they may be consumed + * by a parallel call to the execmem_cache_alloc function. 
+ */ + mutex_lock(mutex); + err = execmem_cache_add_locked(p, alloc_size, GFP_KERNEL); if (err) goto err_reset_direct_map; - return 0; + p = execmem_cache_alloc_locked(range, size); + + mutex_unlock(mutex); + + return p; err_reset_direct_map: + mutex_unlock(mutex); execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); - return err; + return NULL; } static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { void *p; - int err; p = __execmem_cache_alloc(range, size); if (p) return p; - err = execmem_cache_populate(range, size); - if (err) - return NULL; - - return __execmem_cache_alloc(range, size); + return execmem_cache_populate_alloc(range, size); } static inline bool is_pending_free(void *ptr) diff --git a/mm/fadvise.c b/mm/fadvise.c index 67028e30aa91..b63fe21416ff 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -43,7 +43,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) return -ESPIPE; mapping = file->f_mapping; - if (!mapping || len < 0) + if (!mapping || len < 0 || offset < 0) return -EINVAL; bdi = inode_to_bdi(mapping->host); diff --git a/mm/filemap.c b/mm/filemap.c index 406cef06b684..4e636647100c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,7 +31,7 @@ #include <linux/hash.h> #include <linux/writeback.h> #include <linux/backing-dev.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hugetlb.h> @@ -228,7 +228,8 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) page_cache_delete(mapping, folio, shadow); } -void filemap_free_folio(struct address_space *mapping, struct folio *folio) +static void filemap_free_folio(const struct address_space *mapping, + struct folio *folio) { void (*free_folio)(struct folio *); @@ -3883,14 +3884,19 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, unsigned int nr_pages = 0, folio_type; unsigned short mmap_miss = 0, mmap_miss_saved; + /* + * Recalculate end_pgoff 
based on file_end before calling + * next_uptodate_folio() to avoid races with concurrent + * truncation. + */ + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; + end_pgoff = min(end_pgoff, file_end); + rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; - file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; - end_pgoff = min(end_pgoff, file_end); - /* * Do not allow to map with PMD across i_size to preserve * SIGBUS semantics. @@ -18,7 +18,7 @@ #include <linux/hugetlb.h> #include <linux/migrate.h> #include <linux/mm_inline.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> @@ -778,7 +778,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, struct page *page = hmm_pfn_to_page(pfns[idx]); phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]); size_t offset = idx * map->dma_entry_size; - unsigned long attrs = 0; + unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT; dma_addr_t dma_addr; int ret; @@ -871,7 +871,7 @@ bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx) struct dma_iova_state *state = &map->state; dma_addr_t *dma_addrs = map->dma_list; unsigned long *pfns = map->pfn_list; - unsigned long attrs = 0; + unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT; if ((pfns[idx] & valid_dma) != valid_dma) return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b298cba853ab..970e077019b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -100,6 +100,14 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } +/* If returns true, we are unable to access the VMA's folios. 
*/ +static bool vma_is_special_huge(const struct vm_area_struct *vma) +{ + if (vma_is_dax(vma)) + return false; + return vma_test_any(vma, VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT); +} + unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, enum tva_type type, @@ -113,8 +121,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, /* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma)) supported_orders = THP_ORDERS_ALL_ANON; - else if (vma_is_special_huge(vma)) - supported_orders = THP_ORDERS_ALL_SPECIAL; + else if (vma_is_dax(vma) || vma_is_special_huge(vma)) + supported_orders = THP_ORDERS_ALL_SPECIAL_DAX; else supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; @@ -316,30 +324,77 @@ static ssize_t enabled_show(struct kobject *kobj, return sysfs_emit(buf, "%s\n", output); } +enum anon_enabled_mode { + ANON_ENABLED_ALWAYS = 0, + ANON_ENABLED_INHERIT = 1, + ANON_ENABLED_MADVISE = 2, + ANON_ENABLED_NEVER = 3, +}; + +static const char * const anon_enabled_mode_strings[] = { + [ANON_ENABLED_ALWAYS] = "always", + [ANON_ENABLED_INHERIT] = "inherit", + [ANON_ENABLED_MADVISE] = "madvise", + [ANON_ENABLED_NEVER] = "never", +}; + +enum global_enabled_mode { + GLOBAL_ENABLED_ALWAYS = 0, + GLOBAL_ENABLED_MADVISE = 1, + GLOBAL_ENABLED_NEVER = 2, +}; + +static const char * const global_enabled_mode_strings[] = { + [GLOBAL_ENABLED_ALWAYS] = "always", + [GLOBAL_ENABLED_MADVISE] = "madvise", + [GLOBAL_ENABLED_NEVER] = "never", +}; + +static bool set_global_enabled_mode(enum global_enabled_mode mode) +{ + static const unsigned long thp_flags[] = { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + }; + enum global_enabled_mode m; + bool changed = false; + + for (m = 0; m < ARRAY_SIZE(thp_flags); m++) { + if (m == mode) + changed |= !test_and_set_bit(thp_flags[m], + &transparent_hugepage_flags); + else + changed |= test_and_clear_bit(thp_flags[m], + &transparent_hugepage_flags); + } + + 
return changed; +} + static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - ssize_t ret = count; + int mode; - if (sysfs_streq(buf, "always")) { - clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "madvise")) { - clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "never")) { - clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else - ret = -EINVAL; + mode = sysfs_match_string(global_enabled_mode_strings, buf); + if (mode < 0) + return -EINVAL; - if (ret > 0) { + if (set_global_enabled_mode(mode)) { int err = start_stop_khugepaged(); + if (err) - ret = err; + return err; + } else { + /* + * Recalculate watermarks even when the mode didn't + * change, as the previous code always called + * start_stop_khugepaged() which does this internally. 
+ */ + set_recommended_min_free_kbytes(); } - return ret; + return count; } static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); @@ -515,48 +570,54 @@ static ssize_t anon_enabled_show(struct kobject *kobj, return sysfs_emit(buf, "%s\n", output); } +static bool set_anon_enabled_mode(int order, enum anon_enabled_mode mode) +{ + static unsigned long *enabled_orders[] = { + &huge_anon_orders_always, + &huge_anon_orders_inherit, + &huge_anon_orders_madvise, + }; + enum anon_enabled_mode m; + bool changed = false; + + spin_lock(&huge_anon_orders_lock); + for (m = 0; m < ARRAY_SIZE(enabled_orders); m++) { + if (m == mode) + changed |= !__test_and_set_bit(order, enabled_orders[m]); + else + changed |= __test_and_clear_bit(order, enabled_orders[m]); + } + spin_unlock(&huge_anon_orders_lock); + + return changed; +} + static ssize_t anon_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; - ssize_t ret = count; + int mode; - if (sysfs_streq(buf, "always")) { - spin_lock(&huge_anon_orders_lock); - clear_bit(order, &huge_anon_orders_inherit); - clear_bit(order, &huge_anon_orders_madvise); - set_bit(order, &huge_anon_orders_always); - spin_unlock(&huge_anon_orders_lock); - } else if (sysfs_streq(buf, "inherit")) { - spin_lock(&huge_anon_orders_lock); - clear_bit(order, &huge_anon_orders_always); - clear_bit(order, &huge_anon_orders_madvise); - set_bit(order, &huge_anon_orders_inherit); - spin_unlock(&huge_anon_orders_lock); - } else if (sysfs_streq(buf, "madvise")) { - spin_lock(&huge_anon_orders_lock); - clear_bit(order, &huge_anon_orders_always); - clear_bit(order, &huge_anon_orders_inherit); - set_bit(order, &huge_anon_orders_madvise); - spin_unlock(&huge_anon_orders_lock); - } else if (sysfs_streq(buf, "never")) { - spin_lock(&huge_anon_orders_lock); - clear_bit(order, &huge_anon_orders_always); - clear_bit(order, &huge_anon_orders_inherit); - clear_bit(order, 
&huge_anon_orders_madvise); - spin_unlock(&huge_anon_orders_lock); - } else - ret = -EINVAL; + mode = sysfs_match_string(anon_enabled_mode_strings, buf); + if (mode < 0) + return -EINVAL; - if (ret > 0) { - int err; + if (set_anon_enabled_mode(order, mode)) { + int err = start_stop_khugepaged(); - err = start_stop_khugepaged(); if (err) - ret = err; + return err; + } else { + /* + * Recalculate watermarks even when the mode didn't + * change, as the previous code always called + * start_stop_khugepaged() which does this internally. + */ + set_recommended_min_free_kbytes(); } - return ret; + + return count; } static struct kobj_attribute anon_enabled_attr = @@ -1157,13 +1218,29 @@ retry: static struct deferred_split *folio_split_queue_lock(struct folio *folio) { - return split_queue_lock(folio_nid(folio), folio_memcg(folio)); + struct deferred_split *queue; + + rcu_read_lock(); + queue = split_queue_lock(folio_nid(folio), folio_memcg(folio)); + /* + * The memcg destruction path is acquiring the split queue lock for + * reparenting. Once you have it locked, it's safe to drop the rcu lock. 
+ */ + rcu_read_unlock(); + + return queue; } static struct deferred_split * folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) { - return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); + struct deferred_split *queue; + + rcu_read_lock(); + queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); + rcu_read_unlock(); + + return queue; } static inline void split_queue_unlock(struct deferred_split *queue) @@ -2341,17 +2418,87 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) mm_dec_nr_ptes(mm); } -int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, +static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t pmdval, struct folio *folio, bool is_present) +{ + const bool is_device_private = folio_is_device_private(folio); + + /* Present and device private folios are rmappable. */ + if (is_present || is_device_private) + folio_remove_rmap_pmd(folio, &folio->page, vma); + + if (folio_test_anon(folio)) { + add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); + } else { + add_mm_counter(mm, mm_counter_file(folio), + -HPAGE_PMD_NR); + + if (is_present && pmd_young(pmdval) && + likely(vma_has_recency(vma))) + folio_mark_accessed(folio); + } + + /* Device private folios are pinned. */ + if (is_device_private) + folio_put(folio); +} + +static struct folio *normal_or_softleaf_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmdval, bool is_present) +{ + if (is_present) + return vm_normal_folio_pmd(vma, addr, pmdval); + + if (!thp_migration_supported()) + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + return pmd_to_softleaf_folio(pmdval); +} + +static bool has_deposited_pgtable(struct vm_area_struct *vma, pmd_t pmdval, + struct folio *folio) +{ + /* Some architectures require unconditional depositing. 
*/ + if (arch_needs_pgtable_deposit()) + return true; + + /* + * Huge zero always deposited except for DAX which handles itself, see + * set_huge_zero_folio(). + */ + if (is_huge_zero_pmd(pmdval)) + return !vma_is_dax(vma); + + /* + * Otherwise, only anonymous folios are deposited, see + * __do_huge_pmd_anonymous_page(). + */ + return folio && folio_test_anon(folio); +} + +/** + * zap_huge_pmd - Zap a huge THP which is of PMD size. + * @tlb: The MMU gather TLB state associated with the operation. + * @vma: The VMA containing the range to zap. + * @pmd: A pointer to the leaf PMD entry. + * @addr: The virtual address for the range to zap. + * + * Returns: %true on success, %false otherwise. + */ +bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { - pmd_t orig_pmd; + struct mm_struct *mm = tlb->mm; + struct folio *folio = NULL; + bool is_present = false; + bool has_deposit; spinlock_t *ptl; + pmd_t orig_pmd; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) - return 0; + return false; /* * For architectures like ppc64 we look at deposited pgtable * when calling pmdp_huge_get_and_clear. 
So do the @@ -2362,64 +2509,19 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { - if (arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); - spin_unlock(ptl); - } else if (is_huge_zero_pmd(orig_pmd)) { - if (!vma_is_dax(vma) || arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); - spin_unlock(ptl); - } else { - struct folio *folio = NULL; - int flush_needed = 1; - - if (pmd_present(orig_pmd)) { - struct page *page = pmd_page(orig_pmd); - - folio = page_folio(page); - folio_remove_rmap_pmd(folio, page, vma); - WARN_ON_ONCE(folio_mapcount(folio) < 0); - VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (pmd_is_valid_softleaf(orig_pmd)) { - const softleaf_t entry = softleaf_from_pmd(orig_pmd); - - folio = softleaf_to_folio(entry); - flush_needed = 0; - - if (!thp_migration_supported()) - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); - } - - if (folio_test_anon(folio)) { - zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - } else { - if (arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(folio), - -HPAGE_PMD_NR); - - /* - * Use flush_needed to indicate whether the PMD entry - * is present, instead of checking pmd_present() again. 
- */ - if (flush_needed && pmd_young(orig_pmd) && - likely(vma_has_recency(vma))) - folio_mark_accessed(folio); - } - if (folio_is_device_private(folio)) { - folio_remove_rmap_pmd(folio, &folio->page, vma); - WARN_ON_ONCE(folio_mapcount(folio) < 0); - folio_put(folio); - } + is_present = pmd_present(orig_pmd); + folio = normal_or_softleaf_folio_pmd(vma, addr, orig_pmd, is_present); + has_deposit = has_deposited_pgtable(vma, orig_pmd, folio); + if (folio) + zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present); + if (has_deposit) + zap_deposited_table(mm, pmd); - spin_unlock(ptl); - if (flush_needed) - tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); - } - return 1; + spin_unlock(ptl); + if (is_present && folio) + tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); + return true; } #ifndef pmd_move_must_withdraw @@ -2864,7 +2966,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); arch_check_zapped_pud(vma, orig_pud); tlb_remove_pud_tlb_entry(tlb, pud, addr); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { @@ -2972,7 +3074,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { pte_t entry; - entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); + entry = pfn_pte(zero_pfn(addr), vma->vm_page_prot); entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); @@ -3015,7 +3117,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) + if (vma_is_special_huge(vma)) return; if (unlikely(pmd_is_migration_entry(old_pmd))) { const softleaf_t old_entry = softleaf_from_pmd(old_pmd); @@ -3908,7 +4010,7 @@ static int 
__folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1); if (do_lru) - unlock_page_lruvec(lruvec); + lruvec_unlock(lruvec); if (ci) swap_cluster_unlock(ci); @@ -4106,7 +4208,7 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - if (old_order == HPAGE_PMD_ORDER) + if (is_pmd_order(old_order)) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); return ret; @@ -4456,7 +4558,7 @@ retry: goto next; } if (!folio_trylock(folio)) - goto next; + goto requeue; if (!split_folio(folio)) { did_split = true; if (underused) @@ -4465,13 +4567,18 @@ retry: } folio_unlock(folio); next: + /* + * If thp_underused() returns false, or if split_folio() + * succeeds, or if split_folio() fails in the case it was + * underused, then consider it used and don't add it back to + * split_queue. + */ if (did_split || !folio_test_partially_mapped(folio)) continue; +requeue: /* - * Only add back to the queue if folio is partially mapped. - * If thp_underused returns false, or if split_folio fails - * in the case it was underused, then consider it used and - * don't add it back to split_queue. + * Add back partially mapped folios, or underused folios that + * we could not lock this round. 
*/ fqueue = folio_split_queue_lock_irqsave(folio, &flags); if (list_empty(&folio->_deferred_list)) { @@ -4576,8 +4683,16 @@ next: static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) { - return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || - is_vm_hugetlb_page(vma); + if (vma_is_dax(vma)) + return true; + if (vma_is_special_huge(vma)) + return true; + if (vma_test(vma, VMA_IO_BIT)) + return true; + if (is_vm_hugetlb_page(vma)) + return true; + + return false; } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 327eaa4074d3..f24bf49be047 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1017,34 +1017,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -/** - * vma_kernel_pagesize - Page size granularity for this VMA. - * @vma: The user mapping. - * - * Folios in this VMA will be aligned to, and at least the size of the - * number of bytes returned by this function. - * - * Return: The default size of the folios allocated when backing a VMA. - */ -unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) -{ - if (vma->vm_ops && vma->vm_ops->pagesize) - return vma->vm_ops->pagesize(vma); - return PAGE_SIZE; -} -EXPORT_SYMBOL_GPL(vma_kernel_pagesize); - -/* - * Return the page size being used by the MMU to back a VMA. In the majority - * of cases, the page size used by the kernel matches the MMU size. On - * architectures where it differs, an architecture-specific 'strong' - * version of this symbol is required. - */ -__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - return vma_kernel_pagesize(vma); -} - /* * Flags for MAP_PRIVATE reservations. 
These are stored in the bottom * bits of the reservation map pointer, which are always clear due to @@ -1157,15 +1129,7 @@ void resv_map_release(struct kref *ref) static inline struct resv_map *inode_resv_map(struct inode *inode) { - /* - * At inode evict time, i_mapping may not point to the original - * address space within the inode. This original address space - * contains the pointer to the resv_map. So, always use the - * address space embedded within the inode. - * The VERY common case is inode->mapping == &inode->i_data but, - * this may not be true for device special inodes. - */ - return (struct resv_map *)(&inode->i_data)->i_private_data; + return HUGETLBFS_I(inode)->resv_map; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) @@ -1194,7 +1158,7 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } @@ -1202,7 +1166,7 @@ static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *ma static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -3168,6 +3132,7 @@ found: /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, + struct hstate *h, unsigned long start_page_number, unsigned long end_page_number) { @@ -3176,6 +3141,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, struct page *page = folio_page(folio, 
start_page_number); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; + unsigned int order = huge_page_order(h); /* * As we marked all tail pages with memblock_reserved_mark_noinit(), @@ -3183,7 +3149,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, */ for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) { __init_single_page(page, pfn, zone, nid); - prep_compound_tail((struct page *)folio, pfn - head_pfn); + prep_compound_tail(page, &folio->page, order); set_page_count(page, 0); } } @@ -3203,7 +3169,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); - hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); + hugetlb_folio_init_tail_vmemmap(folio, h, 1, nr_pages); prep_compound_head(&folio->page, huge_page_order(h)); } @@ -3260,7 +3226,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, * time as this is early in boot and there should * be no contention. */ - hugetlb_folio_init_tail_vmemmap(folio, + hugetlb_folio_init_tail_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } @@ -4252,6 +4218,9 @@ static __init int hugetlb_add_param(char *s, int (*setup)(char *)) size_t len; char *p; + if (!s) + return -EINVAL; + if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) return -EINVAL; @@ -4818,6 +4787,18 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) return 0; } +#ifdef CONFIG_USERFAULTFD +static bool hugetlb_can_userfault(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops hugetlb_uffd_ops = { + .can_userfault = hugetlb_can_userfault, +}; +#endif + /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 
@@ -4831,6 +4812,9 @@ const struct vm_operations_struct hugetlb_vm_ops = { .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &hugetlb_uffd_ops, +#endif }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, @@ -6600,7 +6584,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6634,7 +6618,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (desc && !vma_desc_test(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6671,7 +6655,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6735,7 +6719,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. 
*/ diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index a9280259e12a..4a077d231d3a 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -19,14 +19,15 @@ #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" +#include "internal.h" /** * struct vmemmap_remap_walk - walk vmemmap page table * * @remap_pte: called for each lowest-level entry (PTE). * @nr_walked: the number of walked pte. - * @reuse_page: the page which is reused for the tail vmemmap pages. - * @reuse_addr: the virtual address of the @reuse_page page. + * @vmemmap_head: the page to be installed as first in the vmemmap range + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. * @flags: used to modify behavior in vmemmap page table walking @@ -35,17 +36,17 @@ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk); + unsigned long nr_walked; - struct page *reuse_page; - unsigned long reuse_addr; + struct page *vmemmap_head; + struct page *vmemmap_tail; struct list_head *vmemmap_pages; + /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) -/* synchronize_rcu() to avoid writes from page_ref_add_unless() */ -#define VMEMMAP_SYNCHRONIZE_RCU BIT(2) unsigned long flags; }; @@ -141,14 +142,7 @@ static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, { struct vmemmap_remap_walk *vmemmap_walk = walk->private; - /* - * The reuse_page is found 'first' in page table walking before - * starting remapping. 
- */ - if (!vmemmap_walk->reuse_page) - vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); - else - vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); + vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; @@ -208,18 +202,12 @@ static void free_vmemmap_page_list(struct list_head *list) static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { - /* - * Remap the tail pages as read-only to catch illegal write operation - * to the tail pages. - */ - pgprot_t pgprot = PAGE_KERNEL_RO; struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ - if (unlikely(addr == walk->reuse_addr)) { - pgprot = PAGE_KERNEL; - list_del(&walk->reuse_page->lru); + if (unlikely(walk->nr_walked == 0 && walk->vmemmap_head)) { + list_del(&walk->vmemmap_head->lru); /* * Makes sure that preceding stores to the page contents from @@ -227,53 +215,50 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * write. */ smp_wmb(); + + entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL); + } else { + /* + * Remap the tail pages as read-only to catch illegal write + * operation to the tail pages. + */ + entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO); } - entry = mk_pte(walk->reuse_page, pgprot); list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } -/* - * How many struct page structs need to be reset. When we reuse the head - * struct page, the special metadata (e.g. page->flags or page->mapping) - * cannot copy to the tail struct page structs. The invalid value will be - * checked in the free_tail_page_prepare(). In order to avoid the message - * of "corrupted mapping in tail page". We need to reset at least 4 (one - * head struct page struct and three tail struct page structs) struct page - * structs. 
- */ -#define NR_RESET_STRUCT_PAGE 4 - -static inline void reset_struct_pages(struct page *start) -{ - struct page *from = start + NR_RESET_STRUCT_PAGE; - - BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); - memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); -} - static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { - pgprot_t pgprot = PAGE_KERNEL; struct page *page; - void *to; - - BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); + struct page *from, *to; page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); + + /* + * Initialize tail pages in the newly allocated vmemmap page. + * + * There is folio-scope metadata that is encoded in the first few + * tail pages. + * + * Use the value last tail page in the page with the head page + * to initialize the rest of tail pages. + */ + from = compound_head((struct page *)addr) + + PAGE_SIZE / sizeof(struct page) - 1; to = page_to_virt(page); - copy_page(to, (void *)walk->reuse_addr); - reset_struct_pages(to); + for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++, to++) + *to = *from; /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); - set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); + set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); } /** @@ -283,33 +268,28 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. - * @reuse: reuse address. - * * Return: %0 on success, negative error code otherwise. */ -static int vmemmap_remap_split(unsigned long start, unsigned long end, - unsigned long reuse) +static int vmemmap_remap_split(unsigned long start, unsigned long end) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; - /* See the comment in the vmemmap_remap_free(). 
*/ - BUG_ON(start - reuse != PAGE_SIZE); - - return vmemmap_remap_range(reuse, end, &walk); + return vmemmap_remap_range(start, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) - * to the page which @reuse is mapped to, then free vmemmap - * which the range are mapped to. + * to use @vmemmap_head/tail, then free vmemmap which + * the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. - * @reuse: reuse address. + * @vmemmap_head: the page to be installed as first in the vmemmap range + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags @@ -317,69 +297,38 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end, * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, - unsigned long reuse, + struct page *vmemmap_head, + struct page *vmemmap_tail, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, - .reuse_addr = reuse, + .vmemmap_head = vmemmap_head, + .vmemmap_tail = vmemmap_tail, .vmemmap_pages = vmemmap_pages, .flags = flags, }; - int nid = page_to_nid((struct page *)reuse); - gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; - /* - * Allocate a new head vmemmap page to avoid breaking a contiguous - * block of struct page memory when freeing it back to page allocator - * in free_vmemmap_page_list(). This will allow the likely contiguous - * struct page backing memory to be kept contiguous and allowing for - * more allocations of hugepages. Fallback to the currently - * mapped head page in case should it fail to allocate. 
- */ - walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); - if (walk.reuse_page) { - copy_page(page_to_virt(walk.reuse_page), - (void *)walk.reuse_addr); - list_add(&walk.reuse_page->lru, vmemmap_pages); - memmap_pages_add(1); - } + ret = vmemmap_remap_range(start, end, &walk); + if (!ret || !walk.nr_walked) + return ret; + + end = start + walk.nr_walked * PAGE_SIZE; /* - * In order to make remapping routine most efficient for the huge pages, - * the routine of vmemmap page table walking has the following rules - * (see more details from the vmemmap_pte_range()): - * - * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) - * should be continuous. - * - The @reuse address is part of the range [@reuse, @end) that we are - * walking which is passed to vmemmap_remap_range(). - * - The @reuse address is the first in the complete range. - * - * So we need to make sure that @start and @reuse meet the above rules. + * vmemmap_pages contains pages from the previous vmemmap_remap_range() + * call which failed. These are pages which were removed from + * the vmemmap. They will be restored in the following call. */ - BUG_ON(start - reuse != PAGE_SIZE); + walk = (struct vmemmap_remap_walk) { + .remap_pte = vmemmap_restore_pte, + .vmemmap_pages = vmemmap_pages, + .flags = 0, + }; - ret = vmemmap_remap_range(reuse, end, &walk); - if (ret && walk.nr_walked) { - end = reuse + walk.nr_walked * PAGE_SIZE; - /* - * vmemmap_pages contains pages from the previous - * vmemmap_remap_range call which failed. These - * are pages which were removed from the vmemmap. - * They will be restored in the following call. - */ - walk = (struct vmemmap_remap_walk) { - .remap_pte = vmemmap_restore_pte, - .reuse_addr = reuse, - .vmemmap_pages = vmemmap_pages, - .flags = 0, - }; - - vmemmap_remap_range(reuse, end, &walk); - } + vmemmap_remap_range(start, end, &walk); return ret; } @@ -416,34 +365,26 @@ out: * to remap. 
* @end: end address of the vmemmap virtual address range that we want to * remap. - * @reuse: reuse address. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, unsigned long flags) + unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, - .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, .flags = flags, }; - /* See the comment in the vmemmap_remap_free(). */ - BUG_ON(start - reuse != PAGE_SIZE); - if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; - return vmemmap_remap_range(reuse, end, &walk); + return vmemmap_remap_range(start, end, &walk); } -DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); -EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); - static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static int __init hugetlb_vmemmap_optimize_param(char *buf) { @@ -455,8 +396,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; - unsigned long vmemmap_reuse; + unsigned long vmemmap_start, vmemmap_end; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); @@ -464,25 +404,20 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; - if (flags & VMEMMAP_SYNCHRONIZE_RCU) - synchronize_rcu(); - + vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); - vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * The pages which the vmemmap virtual address range [@vmemmap_start, - * @vmemmap_end) are mapped to are freed to the buddy allocator, 
and - * the range is mapped to the page which @vmemmap_reuse is mapped to. + * @vmemmap_end) are mapped to are freed to the buddy allocator. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ - ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); - if (!ret) { + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags); + if (!ret) folio_clear_hugetlb_vmemmap_optimized(folio); - static_branch_dec(&hugetlb_optimize_vmemmap_key); - } return ret; } @@ -499,7 +434,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { - return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); + return __hugetlb_vmemmap_restore_folio(h, folio, 0); } /** @@ -522,14 +457,11 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct folio *folio, *t_folio; long restored = 0; long ret = 0; - unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); - /* only need to synchronize_rcu() once for each batch */ - flags &= ~VMEMMAP_SYNCHRONIZE_RCU; - if (ret) break; restored++; @@ -561,14 +493,40 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio * return true; } +static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) +{ + const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER; + struct page *tail, *p; + int node = zone_to_nid(zone); + + tail = READ_ONCE(zone->vmemmap_tails[idx]); + if (likely(tail)) + return tail; + + tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!tail) + return NULL; + + p = page_to_virt(tail); + for (int i = 0; i < PAGE_SIZE / sizeof(struct 
page); i++) + init_compound_tail(p + i, NULL, order, zone); + + if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) { + __free_page(tail); + tail = READ_ONCE(zone->vmemmap_tails[idx]); + } + + return tail; +} + static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { - int ret = 0; - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; - unsigned long vmemmap_reuse; + unsigned long vmemmap_start, vmemmap_end; + struct page *vmemmap_head, *vmemmap_tail; + int nid, ret = 0; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); @@ -576,10 +534,11 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, if (!vmemmap_should_optimize_folio(h, folio)) return ret; - static_branch_inc(&hugetlb_optimize_vmemmap_key); + nid = folio_nid(folio); + vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio)); + if (!vmemmap_tail) + return -ENOMEM; - if (flags & VMEMMAP_SYNCHRONIZE_RCU) - synchronize_rcu(); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed @@ -593,22 +552,30 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, */ folio_set_hugetlb_vmemmap_optimized(folio); + vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0); + if (!vmemmap_head) { + ret = -ENOMEM; + goto out; + } + + copy_page(page_to_virt(vmemmap_head), folio); + list_add(&vmemmap_head->lru, vmemmap_pages); + memmap_pages_add(1); + + vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); - vmemmap_reuse = vmemmap_start; - vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) - * to the page which @vmemmap_reuse is mapped to. Add pages previously - * mapping the range to vmemmap_pages list so that they can be freed by - * the caller. 
+ * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end). + * Add pages previously mapping the range to vmemmap_pages list so that + * they can be freed by the caller. */ - ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, + ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, + vmemmap_head, vmemmap_tail, vmemmap_pages, flags); - if (ret) { - static_branch_dec(&hugetlb_optimize_vmemmap_key); +out: + if (ret) folio_clear_hugetlb_vmemmap_optimized(folio); - } return ret; } @@ -627,27 +594,25 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); - __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; - unsigned long vmemmap_reuse; + unsigned long vmemmap_start, vmemmap_end; if (!vmemmap_should_optimize_folio(h, folio)) return 0; + vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); - vmemmap_reuse = vmemmap_start; - vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Split PMDs on the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end] */ - return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); + return vmemmap_remap_split(vmemmap_start, vmemmap_end); } static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, @@ -657,7 +622,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, struct folio *folio; int nr_to_optimize; LIST_HEAD(vmemmap_pages); - unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH; nr_to_optimize = 0; list_for_each_entry(folio, folio_list, lru) { @@ -676,7 +641,6 @@ static void 
__hugetlb_vmemmap_optimize_folios(struct hstate *h, register_page_bootmem_memmap(pfn_to_section_nr(spfn), &folio->page, HUGETLB_VMEMMAP_RESERVE_SIZE); - static_branch_inc(&hugetlb_optimize_vmemmap_key); continue; } @@ -710,8 +674,6 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, int ret; ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); - /* only need to synchronize_rcu() once for each batch */ - flags &= ~VMEMMAP_SYNCHRONIZE_RCU; /* * Pages to be freed may have been accumulated. If we @@ -790,7 +752,6 @@ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; - unsigned long start, end; struct huge_bootmem_page *m = NULL; void *map; @@ -808,14 +769,6 @@ void __init hugetlb_vmemmap_init_early(int nid) paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); - start = (unsigned long)map; - end = start + nr_pages * sizeof(struct page); - - if (vmemmap_populate_hvo(start, end, nid, - HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) - continue; - - memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; @@ -831,11 +784,26 @@ void __init hugetlb_vmemmap_init_early(int nid) } } +static struct zone *pfn_to_zone(unsigned nid, unsigned long pfn) +{ + struct zone *zone; + enum zone_type zone_type; + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + zone = &NODE_DATA(nid)->node_zones[zone_type]; + if (zone_spans_pfn(zone, pfn)) + return zone; + } + + return NULL; +} + void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; + struct zone *zone = NULL; struct hstate *h; void *map; @@ -850,28 +818,41 @@ void __init hugetlb_vmemmap_init_late(int nid) h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); + map = pfn_to_page(pfn); + start = (unsigned long)map; + end = 
start + nr_pages * sizeof(struct page); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. - * Remove it from the list, and undo HVO. + * Remove it from the list, and populate it normally. */ list_del(&m->list); - map = pfn_to_page(pfn); - - start = (unsigned long)map; - end = start + nr_pages * sizeof(struct page); - - vmemmap_undo_hvo(start, end, nid, - HUGETLB_VMEMMAP_RESERVE_SIZE); - nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; + vmemmap_populate(start, end, nid, NULL); + nr_mmap = end - start; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; - } else + } + + if (!zone || !zone_spans_pfn(zone, pfn)) + zone = pfn_to_zone(nid, pfn); + if (WARN_ON_ONCE(!zone)) + continue; + + if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone, + HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { + /* Fallback if HVO population fails */ + vmemmap_populate(start, end, nid, NULL); + nr_mmap = end - start; + } else { m->flags |= HUGE_BOOTMEM_ZONES_VALID; + nr_mmap = HUGETLB_VMEMMAP_RESERVE_SIZE; + } + + memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); } } #endif @@ -889,10 +870,27 @@ static const struct ctl_table hugetlb_vmemmap_sysctls[] = { static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; + struct zone *zone; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); + for_each_zone(zone) { + for (int i = 0; i < NR_VMEMMAP_TAILS; i++) { + struct page *tail, *p; + unsigned int order; + + tail = zone->vmemmap_tails[i]; + if (!tail) + continue; + + order = i + VMEMMAP_TAIL_MIN_ORDER; + p = page_to_virt(tail); + for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++) + init_compound_tail(p + j, NULL, order, zone); + } + } + for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); diff --git a/mm/internal.h 
b/mm/internal.h index cb0af847d7d9..5a2ddcf68e0b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #include <linux/khugepaged.h> #include <linux/mm.h> #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> #include <linux/pagemap.h> #include <linux/pagewalk.h> #include <linux/rmap.h> @@ -516,14 +517,30 @@ void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); +/** + * sync_with_folio_pmd_zap - sync with concurrent zapping of a folio PMD + * @mm: The mm_struct. + * @pmdp: Pointer to the pmd that was found to be pmd_none(). + * + * When we find a pmd_none() while unmapping a folio without holding the PTL, + * zap_huge_pmd() may have cleared the PMD but not yet modified the folio to + * indicate that it's unmapped. Skipping the PMD without synchronization could + * make folio unmapping code assume that unmapping failed. + * + * Wait for concurrent zapping to complete by grabbing the PTL. 
+ */ +static inline void sync_with_folio_pmd_zap(struct mm_struct *mm, pmd_t *pmdp) +{ + spinlock_t *ptl = pmd_lock(mm, pmdp); + + spin_unlock(ptl); +} + struct zap_details; -void unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct zap_details *details); -void zap_page_range_single_batched(struct mmu_gather *tlb, +void zap_vma_range_batched(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long size, struct zap_details *details); +int zap_vma_for_reaping(struct vm_area_struct *vma); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); @@ -540,7 +557,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); -void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); @@ -624,6 +640,11 @@ int user_proactive_reclaim(char *buf, pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* + * in mm/khugepaged.c + */ +void set_recommended_min_free_kbytes(void); + +/* * in mm/page_alloc.c */ #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -878,13 +899,21 @@ static inline void prep_compound_head(struct page *page, unsigned int order) INIT_LIST_HEAD(&folio->_deferred_list); } -static inline void prep_compound_tail(struct page *head, int tail_idx) +static inline void prep_compound_tail(struct page *tail, + const struct page *head, unsigned int order) { - struct page *p = head + tail_idx; + tail->mapping = TAIL_MAPPING; + set_compound_head(tail, head, order); + set_page_private(tail, 0); +} - p->mapping = TAIL_MAPPING; - set_compound_head(p, head); - 
set_page_private(p, 0); +static inline void init_compound_tail(struct page *tail, + const struct page *head, unsigned int order, struct zone *zone) +{ + atomic_set(&tail->_mapcount, -1); + set_page_node(tail, zone_to_nid(zone)); + set_page_zone(tail, zone_idx(zone)); + prep_compound_tail(tail, head, order); } void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); @@ -929,12 +958,59 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int, bool); +/* + * mm/sparse.c + */ #ifdef CONFIG_SPARSEMEM void sparse_init(void); +int sparse_index_init(unsigned long section_nr, int nid); + +static inline void sparse_init_one_section(struct mem_section *ms, + unsigned long pnum, struct page *mem_map, + struct mem_section_usage *usage, unsigned long flags) +{ + unsigned long coded_mem_map; + + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); + + /* + * We encode the start PFN of the section into the mem_map such that + * page_to_pfn() on !CONFIG_SPARSEMEM_VMEMMAP can simply subtract it + * from the page pointer to obtain the PFN. 
+ */ + coded_mem_map = (unsigned long)(mem_map - section_nr_to_pfn(pnum)); + VM_WARN_ON_ONCE(coded_mem_map & ~SECTION_MAP_MASK); + + ms->section_mem_map &= ~SECTION_MAP_MASK; + ms->section_mem_map |= coded_mem_map; + ms->section_mem_map |= flags | SECTION_HAS_MEM_MAP; + ms->usage = usage; +} + +static inline void __section_mark_present(struct mem_section *ms, + unsigned long section_nr) +{ + if (section_nr > __highest_present_section_nr) + __highest_present_section_nr = section_nr; + + ms->section_mem_map |= SECTION_MARKED_PRESENT; +} #else static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ +/* + * mm/sparse-vmemmap.c + */ +#ifdef CONFIG_SPARSEMEM_VMEMMAP +void sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages); +#else +static inline void sparse_init_subsection_map(unsigned long pfn, + unsigned long nr_pages) +{ +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -1218,6 +1294,18 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } return fpin; } + +static inline bool vma_supports_mlock(const struct vm_area_struct *vma) +{ + if (vma_test_any_mask(vma, VMA_SPECIAL_FLAGS)) + return false; + if (vma_test_single_mask(vma, VMA_DROPPABLE)) + return false; + if (vma_is_dax(vma) || is_vm_hugetlb_page(vma)) + return false; + return vma != get_gate_vma(current->mm); +} + #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } @@ -1233,7 +1321,17 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT DECLARE_STATIC_KEY_TRUE(deferred_pages); +static inline bool deferred_pages_enabled(void) +{ + return static_branch_unlikely(&deferred_pages); +} + bool __init deferred_grow_zone(struct zone *zone, unsigned int order); +#else +static inline bool deferred_pages_enabled(void) +{ + return false; +} #endif /* 
CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void init_deferred_page(unsigned long pfn, int nid); @@ -1450,6 +1548,8 @@ int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, } #endif +void clear_vm_uninitialized_flag(struct vm_struct *vm); + int __must_check __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); @@ -1748,26 +1848,108 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); -void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); -int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t pgprot); +int remap_pfn_range_prepare(struct vm_area_desc *desc); +int remap_pfn_range_complete(struct vm_area_struct *vma, + struct mmap_action *action); +int simple_ioremap_prepare(struct vm_area_desc *desc); -static inline void io_remap_pfn_range_prepare(struct vm_area_desc *desc, - unsigned long orig_pfn, unsigned long size) +static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) { + struct mmap_action *action = &desc->action; + const unsigned long orig_pfn = action->remap.start_pfn; + const pgprot_t orig_pgprot = action->remap.pgprot; + const unsigned long size = action->remap.size; const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + int err; + + action->remap.start_pfn = pfn; + action->remap.pgprot = pgprot_decrypted(orig_pgprot); + err = remap_pfn_range_prepare(desc); + if (err) + return err; - return remap_pfn_range_prepare(desc, pfn); + /* Remap does the actual work. 
*/ + action->type = MMAP_REMAP_PFN; + return 0; } -static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, - unsigned long addr, unsigned long orig_pfn, unsigned long size, - pgprot_t orig_prot) +/* + * When we succeed an mmap action or just before we unmap a VMA on error, we + * need to ensure any rmap lock held is released. On unmap it's required to + * avoid a deadlock. + */ +static inline void maybe_rmap_unlock_action(struct vm_area_struct *vma, + struct mmap_action *action) { - const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); - const pgprot_t prot = pgprot_decrypted(orig_prot); + struct file *file; + + if (!action->hide_from_rmap_until_complete) + return; + + VM_WARN_ON_ONCE(vma_is_anonymous(vma)); + file = vma->vm_file; + i_mmap_unlock_write(file->f_mapping); + action->hide_from_rmap_until_complete = false; +} + +#ifdef CONFIG_MMU_NOTIFIER +static inline bool clear_flush_young_ptes_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + bool young; - return remap_pfn_range_complete(vma, addr, pfn, size, prot); + young = clear_flush_young_ptes(vma, addr, ptep, nr); + young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, + addr + nr * PAGE_SIZE); + return young; } +static inline bool pmdp_clear_flush_young_notify(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + bool young; + + young = pmdp_clear_flush_young(vma, addr, pmdp); + young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, addr + PMD_SIZE); + return young; +} + +static inline bool test_and_clear_young_ptes_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + bool young; + + young = test_and_clear_young_ptes(vma, addr, ptep, nr); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + nr * PAGE_SIZE); + return young; +} + +static inline bool pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + bool young; + + young = 
pmdp_test_and_clear_young(vma, addr, pmdp); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); + return young; +} + +#else /* CONFIG_MMU_NOTIFIER */ + +#define clear_flush_young_ptes_notify clear_flush_young_ptes +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young +#define test_and_clear_young_ptes_notify test_and_clear_young_ptes +#define pmdp_test_and_clear_young_notify pmdp_test_and_clear_young + +#endif /* CONFIG_MMU_NOTIFIER */ + +extern int sysctl_max_map_count; +static inline int get_sysctl_max_map_count(void) +{ + return READ_ONCE(sysctl_max_map_count); +} + +bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, + unsigned long npages); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 32e390c42c53..32bcfbfcf15f 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -15,11 +15,6 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) return v->vm_pgoff; } -static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) -{ - return v->vm_pgoff + vma_pages(v) - 1; -} - INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index f084e7a5df1e..9c880f607c6a 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -292,7 +292,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) return; } - pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + pte_free_kernel(&init_mm, pte_start); pmd_clear(pmd); } @@ -307,7 +307,7 @@ static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) return; } - pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pmd_free(&init_mm, pmd_start); pud_clear(pud); } @@ -322,7 +322,7 @@ static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) return; } - pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + pud_free(&init_mm, pud_start); 
p4d_clear(p4d); } @@ -337,7 +337,7 @@ static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) return; } - p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + p4d_free(&init_mm, p4d_start); pgd_clear(pgd); } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 27efb78eb32d..e804b1e1f886 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -638,7 +638,7 @@ void kasan_report_async(void) */ void kasan_non_canonical_hook(unsigned long addr) { - unsigned long orig_addr; + unsigned long orig_addr, user_orig_addr; const char *bug_type; /* @@ -650,6 +650,9 @@ void kasan_non_canonical_hook(unsigned long addr) orig_addr = (unsigned long)kasan_shadow_to_mem((void *)addr); + /* Strip pointer tag before comparing against userspace ranges */ + user_orig_addr = (unsigned long)set_tag((void *)orig_addr, 0); + /* * For faults near the shadow address for NULL, we can be fairly certain * that this is a KASAN shadow memory access. @@ -661,11 +664,13 @@ void kasan_non_canonical_hook(unsigned long addr) * address, but make it clear that this is not necessarily what's * actually going on. 
*/ - if (orig_addr < PAGE_SIZE) + if (user_orig_addr < PAGE_SIZE) { bug_type = "null-ptr-deref"; - else if (orig_addr < TASK_SIZE) + orig_addr = user_orig_addr; + } else if (user_orig_addr < TASK_SIZE) { bug_type = "probably user-memory-access"; - else if (addr_in_shadow((void *)addr)) + orig_addr = user_orig_addr; + } else if (addr_in_shadow((void *)addr)) bug_type = "probably wild-memory-access"; else bug_type = "maybe wild-memory-access"; diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 7393957f9a20..655dc5ce3240 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -51,7 +51,7 @@ /* === Data ================================================================= */ -static bool kfence_enabled __read_mostly; +bool kfence_enabled __read_mostly; static bool disabled_by_warn __read_mostly; unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; @@ -336,6 +336,7 @@ out: static check_canary_attributes bool check_canary_byte(u8 *addr) { struct kfence_metadata *meta; + enum kfence_fault fault; unsigned long flags; if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr))) @@ -345,8 +346,9 @@ static check_canary_attributes bool check_canary_byte(u8 *addr) meta = addr_to_metadata((unsigned long)addr); raw_spin_lock_irqsave(&meta->lock, flags); - kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); + fault = kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); raw_spin_unlock_irqrestore(&meta->lock, flags); + kfence_handle_fault(fault); return false; } @@ -525,11 +527,14 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z raw_spin_lock_irqsave(&meta->lock, flags); if (!kfence_obj_allocated(meta) || meta->addr != (unsigned long)addr) { + enum kfence_fault fault; + /* Invalid or double-free, bail out. 
*/ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, false, NULL, meta, - KFENCE_ERROR_INVALID_FREE); + fault = kfence_report_error((unsigned long)addr, false, NULL, meta, + KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); + kfence_handle_fault(fault); return; } @@ -731,10 +736,10 @@ static bool __init kfence_init_pool_early(void) * fails for the first page, and therefore expect addr==__kfence_pool in * most failure cases. */ - memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); + memblock_free((void *)addr, KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; - memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE); + memblock_free(kfence_metadata_init, KFENCE_METADATA_SIZE); kfence_metadata_init = NULL; return false; @@ -831,7 +836,8 @@ static void kfence_check_all_canary(void) static int kfence_check_canary_callback(struct notifier_block *nb, unsigned long reason, void *arg) { - kfence_check_all_canary(); + if (READ_ONCE(kfence_enabled)) + kfence_check_all_canary(); return NOTIFY_OK; } @@ -1266,6 +1272,7 @@ bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs struct kfence_metadata *to_report = NULL; unsigned long unprotected_page = 0; enum kfence_error_type error_type; + enum kfence_fault fault; unsigned long flags; if (!is_kfence_address((void *)addr)) @@ -1324,12 +1331,14 @@ out: if (to_report) { raw_spin_lock_irqsave(&to_report->lock, flags); to_report->unprotected_page = unprotected_page; - kfence_report_error(addr, is_write, regs, to_report, error_type); + fault = kfence_report_error(addr, is_write, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. 
*/ - kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); + fault = kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); } + kfence_handle_fault(fault); + return kfence_unprotect(addr); /* Unprotect and let access proceed. */ } diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index f9caea007246..1f618f9b0d12 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -16,6 +16,8 @@ #include "../slab.h" /* for struct kmem_cache */ +extern bool kfence_enabled; + /* * Get the canary byte pattern for @addr. Use a pattern that varies based on the * lower 3 bits of the address, to detect memory corruptions with higher @@ -140,8 +142,18 @@ enum kfence_error_type { KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ }; -void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, - const struct kfence_metadata *meta, enum kfence_error_type type); +enum kfence_fault { + KFENCE_FAULT_NONE, + KFENCE_FAULT_REPORT, + KFENCE_FAULT_OOPS, + KFENCE_FAULT_PANIC, +}; + +enum kfence_fault +kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type); + +void kfence_handle_fault(enum kfence_fault fault); void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta) __must_hold(&meta->lock); diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 787e87c26926..d548536864b1 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -7,9 +7,12 @@ #include <linux/stdarg.h> +#include <linux/bug.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/lockdep.h> #include <linux/math.h> +#include <linux/panic.h> #include <linux/printk.h> #include <linux/sched/debug.h> #include <linux/seq_file.h> @@ -29,6 +32,26 @@ #define ARCH_FUNC_PREFIX "" #endif +static enum kfence_fault kfence_fault __ro_after_init = KFENCE_FAULT_REPORT; + +static int __init early_kfence_fault(char *arg) +{ + if (!arg) + return -EINVAL; + + 
if (!strcmp(arg, "report")) + kfence_fault = KFENCE_FAULT_REPORT; + else if (!strcmp(arg, "oops")) + kfence_fault = KFENCE_FAULT_OOPS; + else if (!strcmp(arg, "panic")) + kfence_fault = KFENCE_FAULT_PANIC; + else + return -EINVAL; + + return 0; +} +early_param("kfence.fault", early_kfence_fault); + /* Helper function to either print to a seq_file or to console. */ __printf(2, 3) static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) @@ -189,8 +212,9 @@ static const char *get_access_type(bool is_write) return str_write_read(is_write); } -void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, - const struct kfence_metadata *meta, enum kfence_error_type type) +enum kfence_fault +kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type) { unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; @@ -206,7 +230,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */ if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) - return; + return KFENCE_FAULT_NONE; /* * Because we may generate reports in printk-unfriendly parts of the @@ -282,6 +306,25 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); + + return kfence_fault; +} + +void kfence_handle_fault(enum kfence_fault fault) +{ + switch (fault) { + case KFENCE_FAULT_NONE: + case KFENCE_FAULT_REPORT: + break; + case KFENCE_FAULT_OOPS: + BUG(); + break; + case KFENCE_FAULT_PANIC: + /* Disable KFENCE to avoid recursion if check_on_panic is set. 
 */ + WRITE_ONCE(kfence_enabled, false); + panic("kfence.fault=panic set ...\n"); + break; + } } #ifdef CONFIG_PRINTK diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1dd3cfca610d..b8452dbdb043 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -46,6 +46,7 @@ enum scan_result { SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, + SCAN_PAGE_LAZYFREE, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, @@ -68,7 +69,10 @@ enum scan_result { static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ +/* + * default scan 8*HPAGE_PMD_NR ptes, pte_mapped_hugepage, pmd_mapped, + * no_pte_table or vmas every 10 seconds. + */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; @@ -85,6 +89,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * * Note that these are only respected if collapse was initiated by khugepaged. 
*/ +#define KHUGEPAGED_MAX_PTES_LIMIT (HPAGE_PMD_NR - 1) unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; @@ -100,6 +105,9 @@ struct collapse_control { /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; + /* Num pages scanned (see khugepaged_pages_to_scan) */ + unsigned int progress; + /* nodemask for allocation fallback */ nodemask_t alloc_nmask; }; @@ -252,7 +260,7 @@ static ssize_t max_ptes_none_store(struct kobject *kobj, unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); - if (err || max_ptes_none > HPAGE_PMD_NR - 1) + if (err || max_ptes_none > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; @@ -277,7 +285,7 @@ static ssize_t max_ptes_swap_store(struct kobject *kobj, unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); - if (err || max_ptes_swap > HPAGE_PMD_NR - 1) + if (err || max_ptes_swap > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; @@ -303,7 +311,7 @@ static ssize_t max_ptes_shared_store(struct kobject *kobj, unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); - if (err || max_ptes_shared > HPAGE_PMD_NR - 1) + if (err || max_ptes_shared > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; @@ -375,7 +383,7 @@ int __init khugepaged_init(void) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; - khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; + khugepaged_max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; @@ -387,14 +395,14 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline int hpage_collapse_test_exit(struct mm_struct *mm) +static inline int collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 
0; } -static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) +static inline int collapse_test_exit_or_disable(struct mm_struct *mm) { - return hpage_collapse_test_exit(mm) || + return collapse_test_exit(mm) || mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } @@ -428,7 +436,7 @@ void __khugepaged_enter(struct mm_struct *mm) int wakeup; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); + VM_BUG_ON_MM(collapse_test_exit(mm), mm); if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; @@ -482,7 +490,7 @@ void __khugepaged_exit(struct mm_struct *mm) } else if (slot) { /* * This is required to serialize against - * hpage_collapse_test_exit() (which is guaranteed to run + * collapse_test_exit() (which is guaranteed to run * under mmap sem read mode). Stop here (after we return all * pagetables will be destroyed) until khugepaged has finished * working on the pagetables under the mmap_lock. @@ -571,7 +579,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); - /* See hpage_collapse_scan_pmd(). */ + /* + * If the vma has the VM_DROPPABLE flag, the collapse will + * preserve the lazyfree property without needing to skip. + */ + if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && + folio_test_lazyfree(folio) && !pte_dirty(pteval)) { + result = SCAN_PAGE_LAZYFREE; + goto out; + } + + /* See collapse_scan_pmd(). 
*/ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && @@ -822,7 +840,7 @@ static struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; -static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) +static bool collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -857,7 +875,7 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int hpage_collapse_find_target_node(struct collapse_control *cc) +static int collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; @@ -876,7 +894,7 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) return target_node; } #else -static int hpage_collapse_find_target_node(struct collapse_control *cc) +static int collapse_find_target_node(struct collapse_control *cc) { return 0; } @@ -895,7 +913,7 @@ static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned l enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -966,7 +984,7 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm, /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if hpage_collapse_scan_pmd believes it is worthwhile. + * Only done if khugepaged_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. @@ -1052,7 +1070,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru { gfp_t gfp = (cc->is_khugepaged ? 
alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); - int node = hpage_collapse_find_target_node(cc); + int node = collapse_find_target_node(cc); struct folio *folio; folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); @@ -1230,9 +1248,9 @@ out_nolock: return result; } -static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked, - struct collapse_control *cc) +static enum scan_result collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start_addr, + bool *lock_dropped, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; @@ -1247,19 +1265,24 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); - if (result != SCAN_SUCCEED) + if (result != SCAN_SUCCEED) { + cc->progress++; goto out; + } memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { + cc->progress++; result = SCAN_NO_PTE_TABLE; goto out; } for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { + cc->progress++; + pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; @@ -1314,6 +1337,16 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, } folio = page_folio(page); + /* + * If the vma has the VM_DROPPABLE flag, the collapse will + * preserve the lazyfree property without needing to skip. + */ + if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && + folio_test_lazyfree(folio) && !pte_dirty(pteval)) { + result = SCAN_PAGE_LAZYFREE; + goto out_unmap; + } + if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; @@ -1340,7 +1373,7 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, * hit record. 
*/ node = folio_nid(folio); - if (hpage_collapse_scan_abort(node, cc)) { + if (collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } @@ -1392,7 +1425,7 @@ out_unmap: result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ - *mmap_locked = false; + *lock_dropped = true; } out: trace_mm_khugepaged_scan_pmd(mm, folio, referenced, @@ -1406,7 +1439,7 @@ static void collect_mm_slot(struct mm_slot *slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit(mm)) { + if (collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -1508,7 +1541,7 @@ static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsign if (IS_ERR(folio)) return SCAN_PAGE_NULL; - if (folio_order(folio) != HPAGE_PMD_ORDER) { + if (!is_pmd_order(folio_order(folio))) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } @@ -1761,7 +1794,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; - if (hpage_collapse_test_exit(mm)) + if (collapse_test_exit(mm)) continue; if (!file_backed_vma_is_retractable(vma)) @@ -1991,9 +2024,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. 
*/ - if (folio_order(folio) == HPAGE_PMD_ORDER && - folio->index == start) { - /* Maybe PMD-mapped */ + if (is_pmd_order(folio_order(folio))) { result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } @@ -2279,8 +2310,9 @@ out: return result; } -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, struct collapse_control *cc) +static enum scan_result collapse_scan_file(struct mm_struct *mm, + unsigned long addr, struct file *file, pgoff_t start, + struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; @@ -2320,22 +2352,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned continue; } - if (folio_order(folio) == HPAGE_PMD_ORDER && - folio->index == start) { - /* Maybe PMD-mapped */ + if (is_pmd_order(folio_order(folio))) { result = SCAN_PTE_MAPPED_HUGEPAGE; /* - * For SCAN_PTE_MAPPED_HUGEPAGE, further processing - * by the caller won't touch the page cache, and so - * it's safe to skip LRU and refcount checks before - * returning. + * PMD-sized THP implies that we can only try + * retracting the PTE table. */ folio_put(folio); break; } node = folio_nid(folio); - if (hpage_collapse_scan_abort(node, cc)) { + if (collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; folio_put(folio); break; @@ -2370,6 +2398,10 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned } } rcu_read_unlock(); + if (result == SCAN_PTE_MAPPED_HUGEPAGE) + cc->progress++; + else + cc->progress += HPAGE_PMD_NR; if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && @@ -2385,8 +2417,69 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned return result; } -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result *result, - struct collapse_control *cc) +/* + * Try to collapse a single PMD starting at a PMD aligned addr, and return + * the results. 
+ */ +static enum scan_result collapse_single_pmd(unsigned long addr, + struct vm_area_struct *vma, bool *lock_dropped, + struct collapse_control *cc) +{ + struct mm_struct *mm = vma->vm_mm; + bool triggered_wb = false; + enum scan_result result; + struct file *file; + pgoff_t pgoff; + + mmap_assert_locked(mm); + + if (vma_is_anonymous(vma)) { + result = collapse_scan_pmd(mm, vma, addr, lock_dropped, cc); + goto end; + } + + file = get_file(vma->vm_file); + pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + *lock_dropped = true; +retry: + result = collapse_scan_file(mm, addr, file, pgoff, cc); + + /* + * For MADV_COLLAPSE, when encountering dirty pages, try to writeback, + * then retry the collapse one time. + */ + if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK && + !triggered_wb && mapping_can_writeback(file->f_mapping)) { + const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; + const loff_t lend = lstart + HPAGE_PMD_SIZE - 1; + + filemap_write_and_wait_range(file->f_mapping, lstart, lend); + triggered_wb = true; + goto retry; + } + fput(file); + + if (result == SCAN_PTE_MAPPED_HUGEPAGE) { + mmap_read_lock(mm); + if (collapse_test_exit_or_disable(mm)) + result = SCAN_ANY_PROCESS; + else + result = try_collapse_pte_mapped_thp(mm, addr, + !cc->is_khugepaged); + if (result == SCAN_PMD_MAPPED) + result = SCAN_SUCCEED; + mmap_read_unlock(mm); + } +end: + if (cc->is_khugepaged && result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; + return result; +} + +static void collapse_scan_mm_slot(unsigned int progress_max, + enum scan_result *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { @@ -2394,9 +2487,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; - int progress = 0; + unsigned int progress_prev = cc->progress; - VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); 
*result = SCAN_FAIL; @@ -2419,8 +2511,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - progress++; - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + cc->progress++; + if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); @@ -2428,18 +2520,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result unsigned long hstart, hend; cond_resched(); - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { - progress++; + if (unlikely(collapse_test_exit_or_disable(mm))) { + cc->progress++; break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { - progress++; + cc->progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) { - progress++; + cc->progress++; continue; } if (khugepaged_scan.address < hstart) @@ -2447,47 +2539,21 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - bool mmap_locked = true; + bool lock_dropped = false; cond_resched(); - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; - VM_BUG_ON(khugepaged_scan.address < hstart || + VM_WARN_ON_ONCE(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (!vma_is_anonymous(vma)) { - struct file *file = get_file(vma->vm_file); - pgoff_t pgoff = linear_page_index(vma, - khugepaged_scan.address); - - mmap_read_unlock(mm); - mmap_locked = false; - *result = hpage_collapse_scan_file(mm, - khugepaged_scan.address, file, pgoff, cc); - fput(file); - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { - mmap_read_lock(mm); - if 
(hpage_collapse_test_exit_or_disable(mm)) - goto breakouterloop; - *result = try_collapse_pte_mapped_thp(mm, - khugepaged_scan.address, false); - if (*result == SCAN_PMD_MAPPED) - *result = SCAN_SUCCEED; - mmap_read_unlock(mm); - } - } else { - *result = hpage_collapse_scan_pmd(mm, vma, - khugepaged_scan.address, &mmap_locked, cc); - } - - if (*result == SCAN_SUCCEED) - ++khugepaged_pages_collapsed; + *result = collapse_single_pmd(khugepaged_scan.address, + vma, &lock_dropped, cc); /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; - progress += HPAGE_PMD_NR; - if (!mmap_locked) + if (lock_dropped) /* * We released mmap_lock so break loop. Note * that we drop mmap_lock before all hugepage @@ -2496,7 +2562,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result * correct result back to caller. */ goto breakouterloop_mmap_lock; - if (progress >= pages) + if (cc->progress >= progress_max) goto breakouterloop; } } @@ -2508,9 +2574,9 @@ breakouterloop_mmap_lock: VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or - * if we scanned all vmas of this mm. + * if we scanned all vmas of this mm, or THP got disabled. 
*/ - if (hpage_collapse_test_exit(mm) || !vma) { + if (collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find @@ -2527,7 +2593,8 @@ breakouterloop_mmap_lock: collect_mm_slot(slot); } - return progress; + trace_mm_khugepaged_scan(mm, cc->progress - progress_prev, + khugepaged_scan.mm_slot == NULL); } static int khugepaged_has_work(void) @@ -2543,13 +2610,14 @@ static int khugepaged_wait_event(void) static void khugepaged_do_scan(struct collapse_control *cc) { - unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); + const unsigned int progress_max = READ_ONCE(khugepaged_pages_to_scan); + unsigned int pass_through_head = 0; bool wait = true; enum scan_result result = SCAN_SUCCEED; lru_add_drain_all(); + cc->progress = 0; while (true) { cond_resched(); @@ -2561,13 +2629,12 @@ static void khugepaged_do_scan(struct collapse_control *cc) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) - progress += khugepaged_scan_mm_slot(pages - progress, - &result, cc); + collapse_scan_mm_slot(progress_max, &result, cc); else - progress = pages; + cc->progress = progress_max; spin_unlock(&khugepaged_mm_lock); - if (progress >= pages) + if (cc->progress >= progress_max) break; if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { @@ -2630,7 +2697,7 @@ static int khugepaged(void *none) return 0; } -static void set_recommended_min_free_kbytes(void) +void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; @@ -2671,8 +2738,8 @@ static void set_recommended_min_free_kbytes(void) if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", - min_free_kbytes, recommended_min); + pr_info_ratelimited("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", + min_free_kbytes, 
recommended_min); min_free_kbytes = recommended_min; } @@ -2761,7 +2828,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long hstart, hend, addr; enum scan_result last_fail = SCAN_FAIL; int thps = 0; - bool mmap_locked = true; + bool mmap_unlocked = false; BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); @@ -2773,6 +2840,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, if (!cc) return -ENOMEM; cc->is_khugepaged = false; + cc->progress = 0; mmgrab(mm); lru_add_drain_all(); @@ -2782,13 +2850,12 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { enum scan_result result = SCAN_FAIL; - bool triggered_wb = false; -retry: - if (!mmap_locked) { + if (mmap_unlocked) { cond_resched(); mmap_read_lock(mm); - mmap_locked = true; + mmap_unlocked = false; + *lock_dropped = true; result = hugepage_vma_revalidate(mm, addr, false, &vma, cc); if (result != SCAN_SUCCEED) { @@ -2798,47 +2865,14 @@ retry: hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } - mmap_assert_locked(mm); - if (!vma_is_anonymous(vma)) { - struct file *file = get_file(vma->vm_file); - pgoff_t pgoff = linear_page_index(vma, addr); - mmap_read_unlock(mm); - mmap_locked = false; - *lock_dropped = true; - result = hpage_collapse_scan_file(mm, addr, file, pgoff, - cc); - - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && - mapping_can_writeback(file->f_mapping)) { - loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; - loff_t lend = lstart + HPAGE_PMD_SIZE - 1; - - filemap_write_and_wait_range(file->f_mapping, lstart, lend); - triggered_wb = true; - fput(file); - goto retry; - } - fput(file); - } else { - result = hpage_collapse_scan_pmd(mm, vma, addr, - &mmap_locked, cc); - } - if (!mmap_locked) - *lock_dropped = true; + result = collapse_single_pmd(addr, vma, &mmap_unlocked, cc); -handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; 
break; - case SCAN_PTE_MAPPED_HUGEPAGE: - BUG_ON(mmap_locked); - mmap_read_lock(mm); - result = try_collapse_pte_mapped_thp(mm, addr, true); - mmap_read_unlock(mm); - goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_NO_PTE_TABLE: case SCAN_PTE_NON_PRESENT: @@ -2861,8 +2895,10 @@ handle_result: out_maybelock: /* Caller expects us to hold mmap_lock on return */ - if (!mmap_locked) + if (mmap_unlocked) { + *lock_dropped = true; mmap_read_lock(mm); + } out_nolock: mmap_assert_locked(mm); mmdrop(mm); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d79acf5c5100..2eff0d6b622b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -241,7 +241,7 @@ static int kmemleak_skip_disable; /* If there are leaks that can be reported */ static bool kmemleak_found_leaks; -static bool kmemleak_verbose; +static bool kmemleak_verbose = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_VERBOSE); module_param_named(verbose, kmemleak_verbose, bool, 0600); static void kmemleak_disable(void); @@ -1505,12 +1505,10 @@ static int scan_should_stop(void) * This function may be called from either process or kthread context, * hence the need to check for both stop conditions. */ - if (current->mm) - return signal_pending(current); - else + if (current->flags & PF_KTHREAD) return kthread_should_stop(); - return 0; + return signal_pending(current); } /* @@ -735,21 +735,24 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } -static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) +static bool ksm_compatible(const struct file *file, vma_flags_t vma_flags) { - if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL | - VM_HUGETLB | VM_DROPPABLE)) - return false; /* just ignore the advice */ - + /* Just ignore the advice. 
 */ + if (vma_flags_test_any(&vma_flags, VMA_SHARED_BIT, VMA_MAYSHARE_BIT, + VMA_HUGETLB_BIT)) + return false; + if (vma_flags_test_single_mask(&vma_flags, VMA_DROPPABLE)) + return false; + if (vma_flags_test_any_mask(&vma_flags, VMA_SPECIAL_FLAGS)) + return false; + if (file_is_dax(file)) return false; - #ifdef VM_SAO - if (vm_flags & VM_SAO) + if (vma_flags_test(&vma_flags, VMA_SAO_BIT)) return false; #endif #ifdef VM_SPARC_ADI - if (vm_flags & VM_SPARC_ADI) + if (vma_flags_test(&vma_flags, VMA_SPARC_ADI_BIT)) return false; #endif @@ -758,7 +761,7 @@ static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) static bool vma_ksm_compatible(struct vm_area_struct *vma) { - return ksm_compatible(vma->vm_file, vma->vm_flags); + return ksm_compatible(vma->vm_file, vma->flags); } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, @@ -2825,17 +2828,17 @@ static int ksm_scan_thread(void *nothing) return 0; } -static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags) +static bool __ksm_should_add_vma(const struct file *file, vma_flags_t vma_flags) { - if (vm_flags & VM_MERGEABLE) + if (vma_flags_test(&vma_flags, VMA_MERGEABLE_BIT)) return false; - return ksm_compatible(file, vm_flags); + return ksm_compatible(file, vma_flags); } static void __ksm_add_vma(struct vm_area_struct *vma) { - if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags)) + if (__ksm_should_add_vma(vma->vm_file, vma->flags)) vm_flags_set(vma, VM_MERGEABLE); } @@ -2860,16 +2863,16 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * @mm: Proposed VMA's mm_struct * @file: Proposed VMA's file-backed mapping, if any. - * @vm_flags: Proposed VMA"s flags. + * @vma_flags: Proposed VMA's flags. * - * Returns: @vm_flags possibly updated to mark mergeable. + * Returns: @vma_flags possibly updated to mark mergeable.
*/ -vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, - vm_flags_t vm_flags) +vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, + vma_flags_t vma_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) { - vm_flags |= VM_MERGEABLE; + __ksm_should_add_vma(file, vma_flags)) { + vma_flags_set(&vma_flags, VMA_MERGEABLE_BIT); /* * Generally, the flags here always include MMF_VM_MERGEABLE. * However, in rare cases, this flag may be cleared by ksmd who @@ -2879,7 +2882,7 @@ vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, __ksm_enter(mm); } - return vm_flags; + return vma_flags; } static void ksm_add_vmas(struct mm_struct *mm) @@ -3168,6 +3171,8 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { + /* Ignore the stable/unstable/sqnr flags */ + const unsigned long addr = rmap_item->address & PAGE_MASK; struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; @@ -3180,16 +3185,13 @@ again: } anon_vma_lock_read(anon_vma); } + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { - unsigned long addr; cond_resched(); vma = vmac->vma; - /* Ignore the stable/unstable/sqnr flags */ - addr = rmap_item->address & PAGE_MASK; - if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* diff --git a/mm/list_lru.c b/mm/list_lru.c index 26463ae29c64..dd29bcf8eb5f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -179,6 +179,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, unlock_list_lru(l, false); return false; } +EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { diff --git a/mm/madvise.c b/mm/madvise.c index dbb69400786d..69708e953cf5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -151,13 +151,15 @@ static int 
madvise_update_vma(vm_flags_t new_flags, struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; + vma_flags_t new_vma_flags = legacy_to_vma_flags(new_flags); struct madvise_behavior_range *range = &madv_behavior->range; struct anon_vma_name *anon_name = madv_behavior->anon_name; bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; VMA_ITERATOR(vmi, madv_behavior->mm, range->start); - if (new_flags == vma->vm_flags && (!set_new_anon_name || - anon_vma_name_eq(anon_vma_name(vma), anon_name))) + if (vma_flags_same_mask(&vma->flags, new_vma_flags) && + (!set_new_anon_name || + anon_vma_name_eq(anon_vma_name(vma), anon_name))) return 0; if (set_new_anon_name) @@ -165,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags, range->start, range->end, anon_name); else vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, - range->start, range->end, &new_flags); + range->start, range->end, &new_vma_flags); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -174,7 +176,7 @@ static int madvise_update_vma(vm_flags_t new_flags, /* vm_flags is protected by the mmap_lock held in write mode. 
*/ vma_start_write(vma); - vm_flags_reset(vma, new_flags); + vma->flags = new_vma_flags; if (set_new_anon_name) return replace_anon_vma_name(vma, anon_name); @@ -799,9 +801,10 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma = madv_behavior->vma; - unsigned long start_addr = madv_behavior->range.start; - unsigned long end_addr = madv_behavior->range.end; - struct mmu_notifier_range range; + struct mmu_notifier_range range = { + .start = madv_behavior->range.start, + .end = madv_behavior->range.end, + }; struct mmu_gather *tlb = madv_behavior->tlb; struct mm_walk_ops walk_ops = { .pmd_entry = madvise_free_pte_range, @@ -811,12 +814,6 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) if (!vma_is_anonymous(vma)) return -EINVAL; - range.start = max(vma->vm_start, start_addr); - if (range.start >= vma->vm_end) - return -EINVAL; - range.end = min(vma->vm_end, end_addr); - if (range.end <= vma->vm_start) - return -EINVAL; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); @@ -837,7 +834,7 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range_single call sets things up for shrink_active_list to actually + * zap_vma_range call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. 
@@ -858,12 +855,10 @@ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) struct madvise_behavior_range *range = &madv_behavior->range; struct zap_details details = { .reclaim_pt = true, - .even_cows = true, }; - zap_page_range_single_batched( - madv_behavior->tlb, madv_behavior->vma, range->start, - range->end - range->start, &details); + zap_vma_range_batched(madv_behavior->tlb, madv_behavior->vma, + range->start, range->end - range->start, &details); return 0; } @@ -1198,8 +1193,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ - zap_page_range_single(vma, range->start, - range->end - range->start, NULL); + zap_vma_range(vma, range->start, range->end - range->start); } /* diff --git a/mm/memblock.c b/mm/memblock.c index b3ddfdec7a80..a6a1c91e276d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -17,6 +17,7 @@ #include <linux/seq_file.h> #include <linux/memblock.h> #include <linux/mutex.h> +#include <linux/string_helpers.h> #ifdef CONFIG_KEXEC_HANDOVER #include <linux/libfdt.h> @@ -384,26 +385,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u */ void __init memblock_discard(void) { - phys_addr_t addr, size; + phys_addr_t size; + void *addr; if (memblock.reserved.regions != memblock_reserved_init_regions) { - addr = __pa(memblock.reserved.regions); + addr = memblock.reserved.regions; size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); if (memblock_reserved_in_slab) - kfree(memblock.reserved.regions); + kfree(addr); else - memblock_free_late(addr, size); + memblock_free(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { - addr = __pa(memblock.memory.regions); + addr = memblock.memory.regions; size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); if (memblock_memory_in_slab) - 
kfree(memblock.memory.regions); + kfree(addr); else - memblock_free_late(addr, size); + memblock_free(addr, size); } memblock_memory = NULL; @@ -893,13 +895,81 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.memory, base, size); } +static unsigned long __free_reserved_area(phys_addr_t start, phys_addr_t end, + int poison) +{ + unsigned long pages = 0, pfn; + + if (deferred_pages_enabled()) { + WARN(1, "Cannot free reserved memory because of deferred initialization of the memory map"); + return 0; + } + + for_each_valid_pfn(pfn, PFN_UP(start), PFN_DOWN(end)) { + struct page *page = pfn_to_page(pfn); + void *direct_map_addr; + + /* + * 'direct_map_addr' might be different from the kernel virtual + * address because some architectures use aliases. + * Going via physical address, pfn_to_page() and page_address() + * ensures that we get a _writeable_ alias for the memset(). + */ + direct_map_addr = page_address(page); + /* + * Perform a kasan-unchecked memset() since this memory + * has not been initialized. + */ + direct_map_addr = kasan_reset_tag(direct_map_addr); + if ((unsigned int)poison <= 0xFF) + memset(direct_map_addr, poison, PAGE_SIZE); + + free_reserved_page(page); + pages++; + } + return pages; +} + +unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) +{ + phys_addr_t start_pa, end_pa; + unsigned long pages; + + /* + * end is the first address past the region and it may be beyond what + * __pa() or __pa_symbol() can handle. + * Use the address included in the range for the conversion and add back + * 1 afterwards. 
+ */ + if (__is_kernel((unsigned long)start)) { + start_pa = __pa_symbol(start); + end_pa = __pa_symbol(end - 1) + 1; + } else { + start_pa = __pa(start); + end_pa = __pa(end - 1) + 1; + } + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + if (start_pa < end_pa) + memblock_remove_range(&memblock.reserved, + start_pa, end_pa - start_pa); + } + + pages = __free_reserved_area(start_pa, end_pa, poison); + if (pages && s) + pr_info("Freeing %s memory: %ldK\n", s, K(pages)); + + return pages; +} + /** * memblock_free - free boot memory allocation * @ptr: starting address of the boot memory allocation * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_alloc_xx() API. - * The freeing memory will not be released to the buddy allocator. + * If called after the buddy allocator is available, the memory is released to + * the buddy allocator. */ void __init_memblock memblock_free(void *ptr, size_t size) { @@ -913,17 +983,24 @@ void __init_memblock memblock_free(void *ptr, size_t size) * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_phys_alloc_xx() API. - * The freeing memory will not be released to the buddy allocator. + * If called after the buddy allocator is available, the memory is released to + * the buddy allocator. 
*/ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; + int ret; memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); - return memblock_remove_range(&memblock.reserved, base, size); + ret = memblock_remove_range(&memblock.reserved, base, size); + + if (slab_is_available()) + __free_reserved_area(base, base + size, -1); + + return ret; } int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size, @@ -973,7 +1050,7 @@ __init void memmap_init_kho_scratch_pages(void) /* * Initialize struct pages for free scratch memory. * The struct pages for reserved scratch memory will be set up in - * reserve_bootmem_region() + * memmap_init_reserved_pages() */ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { @@ -1116,6 +1193,21 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t } /** + * memblock_reserved_mark_kern - Mark a reserved memory region with flag + * MEMBLOCK_RSRV_KERN + * + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_reserved_mark_kern(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.reserved, base, size, 1, + MEMBLOCK_RSRV_KERN); +} + +/** * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH. * @base: the base phys addr of the region * @size: the size of the region @@ -1751,32 +1843,6 @@ void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, return addr; } -/** - * memblock_free_late - free pages directly to buddy allocator - * @base: phys starting address of the boot memory block - * @size: size of the boot memory block in bytes - * - * This is only useful when the memblock allocator has already been torn - * down, but we are still initializing the system. 
Pages are released directly - * to the buddy allocator. - */ -void __init memblock_free_late(phys_addr_t base, phys_addr_t size) -{ - phys_addr_t cursor, end; - - end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pS\n", - __func__, &base, &end, (void *)_RET_IP_); - kmemleak_free_part_phys(base, size); - cursor = PFN_UP(base); - end = PFN_DOWN(base + size); - - for (; cursor < end; cursor++) { - memblock_free_pages(cursor, 0); - totalram_pages_inc(); - } -} - /* * Remaining API functions */ @@ -2240,6 +2306,31 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return end_pfn - start_pfn; } +/* + * Initialised pages do not have PageReserved set. This function is called + * for each reserved range and marks the pages PageReserved. + * When deferred initialization of struct pages is enabled it also ensures + * that struct pages are properly initialised. + */ +static void __init memmap_init_reserved_range(phys_addr_t start, + phys_addr_t end, int nid) +{ + unsigned long pfn; + + for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { + struct page *page = pfn_to_page(pfn); + + init_deferred_page(pfn, nid); + + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. 
+ */ + __SetPageReserved(page); + } +} + static void __init memmap_init_reserved_pages(void) { struct memblock_region *region; @@ -2259,7 +2350,7 @@ repeat: end = start + region->size; if (memblock_is_nomap(region)) - reserve_bootmem_region(start, end, nid); + memmap_init_reserved_range(start, end, nid); memblock_set_node(start, region->size, &memblock.reserved, nid); } @@ -2284,7 +2375,7 @@ repeat: if (!numa_valid_node(nid)) nid = early_pfn_to_nid(PFN_DOWN(start)); - reserve_bootmem_region(start, end, nid); + memmap_init_reserved_range(start, end, nid); } } } @@ -2434,7 +2525,7 @@ int reserve_mem_release_by_name(const char *name) return 0; start = phys_to_virt(map->start); - end = start + map->size - 1; + end = start + map->size; snprintf(buf, sizeof(buf), "reserve_mem:%s", name); free_reserved_area(start, end, 0, buf); map->size = 0; @@ -2510,7 +2601,7 @@ static int __init prepare_kho_fdt(void) if (err) goto err_unpreserve_fdt; - err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt, fdt_totalsize(fdt)); if (err) goto err_unpreserve_fdt; @@ -2555,7 +2646,7 @@ static void *__init reserve_mem_kho_retrieve_fdt(void) if (fdt) return fdt; - err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys, NULL); if (err) { if (err != -ENOENT) pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", @@ -2642,23 +2733,25 @@ static int __init reserve_mem(char *p) int len; if (!p) - return -EINVAL; + goto err_param; /* Check if there's room for more reserved memory */ - if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) + if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) { + pr_err("reserve_mem: no more room for reserved memory\n"); return -EBUSY; + } oldp = p; size = memparse(p, &p); if (!size || p == oldp) - return -EINVAL; + goto err_param; if (*p != ':') - return -EINVAL; + goto err_param; align = memparse(p+1, &p); if (*p != ':') - return -EINVAL; + goto err_param; /* * 
memblock_phys_alloc() doesn't like a zero size align, @@ -2672,7 +2765,7 @@ static int __init reserve_mem(char *p) /* name needs to have length but not too big */ if (!len || len >= RESERVE_MEM_NAME_SIZE) - return -EINVAL; + goto err_param; /* Make sure that name has text */ for (p = name; *p; p++) { @@ -2680,11 +2773,13 @@ static int __init reserve_mem(char *p) break; } if (!*p) - return -EINVAL; + goto err_param; /* Make sure the name is not already used */ - if (reserve_mem_find_by_name(name, &start, &tmp)) + if (reserve_mem_find_by_name(name, &start, &tmp)) { + pr_err("reserve_mem: name \"%s\" was already used\n", name); return -EBUSY; + } /* Pick previous allocations up from KHO if available */ if (reserve_mem_kho_revive(name, size, align)) @@ -2692,16 +2787,22 @@ static int __init reserve_mem(char *p) /* TODO: Allocation must be outside of scratch region */ start = memblock_phys_alloc(size, align); - if (!start) + if (!start) { + pr_err("reserve_mem: memblock allocation failed\n"); return -ENOMEM; + } reserved_mem_add(start, size, name); return 1; +err_param: + pr_err("reserve_mem: empty or malformed parameter\n"); + return -EINVAL; } __setup("reserve_mem=", reserve_mem); -#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) +#ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static const char * const flagname[] = { [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG", [ilog2(MEMBLOCK_MIRROR)] = "MIRROR", @@ -2748,10 +2849,8 @@ static int memblock_debug_show(struct seq_file *m, void *private) } DEFINE_SHOW_ATTRIBUTE(memblock_debug); -static int __init memblock_init_debugfs(void) +static inline void memblock_debugfs_expose_arrays(struct dentry *root) { - struct dentry *root = debugfs_create_dir("memblock", NULL); - debugfs_create_file("memory", 0444, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", 0444, root, @@ -2760,7 +2859,48 @@ static int __init memblock_init_debugfs(void) debugfs_create_file("physmem", 0444, root, 
&physmem, &memblock_debug_fops); #endif +} + +#else + +static inline void memblock_debugfs_expose_arrays(struct dentry *root) { } + +#endif /* CONFIG_ARCH_KEEP_MEMBLOCK */ + +static int memblock_reserve_mem_show(struct seq_file *m, void *private) +{ + struct reserve_mem_table *map; + char txtsz[16]; + + guard(mutex)(&reserve_mem_lock); + for (int i = 0; i < reserved_mem_count; i++) { + map = &reserved_mem_table[i]; + if (!map->size) + continue; + + memset(txtsz, 0, sizeof(txtsz)); + string_get_size(map->size, 1, STRING_UNITS_2, txtsz, sizeof(txtsz)); + seq_printf(m, "%s\t\t(%s)\n", map->name, txtsz); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(memblock_reserve_mem); + +static int __init memblock_init_debugfs(void) +{ + struct dentry *root; + + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !reserved_mem_count) + return 0; + + root = debugfs_create_dir("memblock", NULL); + + if (reserved_mem_count) + debugfs_create_file("reserve_mem_param", 0444, root, NULL, + &memblock_reserve_mem_fops); + memblock_debugfs_expose_arrays(root); return 0; } __initcall(memblock_init_debugfs); diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 597af8a80163..433bba9dfe71 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -613,6 +613,7 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) void memcg1_swapout(struct folio *folio, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; + struct obj_cgroup *objcg; unsigned int nr_entries; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); @@ -624,22 +625,20 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) if (!do_memsw_account()) return; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); /* * In case the memcg owning these pages has been offlined and doesn't * have an ID allocated to it anymore, charge the 
closest online * ancestor for the swap instead and transfer the memory+swap charge. */ - swap_memcg = mem_cgroup_private_id_get_online(memcg); nr_entries = folio_nr_pages(folio); - /* Get references for the tail pages, too */ - if (nr_entries > 1) - mem_cgroup_private_id_get_many(swap_memcg, nr_entries - 1); + swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); @@ -647,7 +646,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) folio_unqueue_deferred_split(folio); folio->memcg_data = 0; - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) page_counter_uncharge(&memcg->memory, nr_entries); if (memcg != swap_memcg) { @@ -668,7 +667,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) preempt_enable_nested(); memcg1_check_events(memcg, folio_nid(folio)); - css_put(&memcg->css); + rcu_read_unlock(); + obj_cgroup_put(objcg); } /* @@ -1887,6 +1887,22 @@ static const unsigned int memcg1_events[] = { PGMAJFAULT, }; +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) + reparent_memcg_state_local(memcg, parent, memcg1_stats[i]); +} + +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + int i; + + for (i = 0; i < NR_LRU_LISTS; i++) + reparent_memcg_lruvec_state_local(memcg, parent, i); +} + void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { unsigned long memory, memsw; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index eb3c3c105657..f92f81108d5e 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -27,8 +27,8 @@ void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); int memory_stat_show(struct seq_file *m, void *v); -void mem_cgroup_private_id_get_many(struct 
mem_cgroup *memcg, unsigned int n); -struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg); +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, + unsigned int n); /* Cgroup v1-specific declarations */ #ifdef CONFIG_MEMCG_V1 @@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_memory, int nid); void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent); +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent); + +void reparent_memcg_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx); +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx); void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages); static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 772bac21d155..c3d98ab41f1f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -34,7 +34,7 @@ #include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/vm_event_item.h> #include <linux/smp.h> #include <linux/page-flags.h> @@ -206,26 +206,100 @@ static struct obj_cgroup *obj_cgroup_alloc(void) return objcg; } -static void memcg_reparent_objcgs(struct mem_cgroup *memcg, - struct mem_cgroup *parent) +static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg, + struct mem_cgroup *parent, + int nid) { struct obj_cgroup *objcg, *iter; + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid]; - objcg = rcu_replace_pointer(memcg->objcg, NULL, true); - - spin_lock_irq(&objcg_lock); - + objcg = rcu_replace_pointer(pn->objcg, NULL, true); /* 1) Ready to reparent 
active objcg. */ - list_add(&objcg->list, &memcg->objcg_list); + list_add(&objcg->list, &pn->objcg_list); /* 2) Reparent active objcg and already reparented objcgs to parent. */ - list_for_each_entry(iter, &memcg->objcg_list, list) + list_for_each_entry(iter, &pn->objcg_list, list) WRITE_ONCE(iter->memcg, parent); /* 3) Move already reparented objcgs to the parent's list */ - list_splice(&memcg->objcg_list, &parent->objcg_list); + list_splice(&pn->objcg_list, &parent_pn->objcg_list); + + return objcg; +} + +#ifdef CONFIG_MEMCG_V1 +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force); + +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + /* + * Reparent stats exposed non-hierarchically. Flush @memcg's stats first + * to read its stats accurately , and conservatively flush @parent's + * stats after reparenting to avoid hiding a potentially large stat + * update (e.g. from callers of mem_cgroup_flush_stats_ratelimited()). + */ + __mem_cgroup_flush_stats(memcg, true); + + /* The following counts are all non-hierarchical and need to be reparented. 
*/ + reparent_memcg1_state_local(memcg, parent); + reparent_memcg1_lruvec_state_local(memcg, parent); + + __mem_cgroup_flush_stats(parent, true); +} +#else +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ +} +#endif + +static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + spin_lock_irq(&objcg_lock); + spin_lock_nested(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock, 1); + spin_lock_nested(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock, 2); +} +static inline void reparent_unlocks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + spin_unlock(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock); + spin_unlock(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock); spin_unlock_irq(&objcg_lock); +} - percpu_ref_kill(&objcg->refcnt); +static void memcg_reparent_objcgs(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + int nid; + + for_each_node(nid) { +retry: + if (lru_gen_enabled()) + max_lru_gen_memcg(parent, nid); + + reparent_locks(memcg, parent, nid); + + if (lru_gen_enabled()) { + if (!recheck_lru_gen_max_memcg(parent, nid)) { + reparent_unlocks(memcg, parent, nid); + cond_resched(); + goto retry; + } + lru_gen_reparent_memcg(memcg, parent, nid); + } else { + lru_reparent_memcg(memcg, parent, nid); + } + + objcg = __memcg_reparent_objcgs(memcg, parent, nid); + + reparent_unlocks(memcg, parent, nid); + + percpu_ref_kill(&objcg->refcnt); + } + + reparent_state_local(memcg, parent); } /* @@ -241,7 +315,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); EXPORT_SYMBOL(memcg_bpf_enabled_key); /** - * mem_cgroup_css_from_folio - css of the memcg associated with a folio + * get_mem_cgroup_css_from_folio - acquire a css of the memcg associated with a folio * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated @@ -251,14 +325,16 @@ 
EXPORT_SYMBOL(memcg_bpf_enabled_key); * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) +struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; - if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) - memcg = root_mem_cgroup; + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return &root_mem_cgroup->css; - return &memcg->css; + memcg = get_mem_cgroup_from_folio(folio); + + return memcg ? &memcg->css : &root_mem_cgroup->css; } /** @@ -317,6 +393,7 @@ static const unsigned int memcg_node_stat_items[] = { NR_SHMEM_THPS, NR_FILE_THPS, NR_ANON_THPS, + NR_VMALLOC, NR_KERNEL_STACK_KB, NR_PAGETABLE, NR_SECONDARY_PAGETABLE, @@ -330,6 +407,19 @@ static const unsigned int memcg_node_stat_items[] = { PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, + PGSTEAL_ANON, + PGSTEAL_FILE, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, + PGSCAN_ANON, + PGSCAN_FILE, + PGREFILL, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif @@ -339,10 +429,10 @@ static const unsigned int memcg_stat_items[] = { MEMCG_SWAP, MEMCG_SOCK, MEMCG_PERCPU_B, - MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, + MEMCG_ZSWAP_INCOMP, }; #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) @@ -435,6 +525,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } +#ifdef CONFIG_MEMCG_V1 +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn, + enum node_stat_item idx, long val); + +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx) +{ + int nid; + + for_each_node(nid) { + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + struct lruvec *parent_lruvec = 
mem_cgroup_lruvec(parent, NODE_DATA(nid)); + unsigned long value = lruvec_page_state_local(child_lruvec, idx); + struct mem_cgroup_per_node *child_pn, *parent_pn; + + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec); + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec); + + __mod_memcg_lruvec_state(child_pn, idx, -value); + __mod_memcg_lruvec_state(parent_pn, idx, value); + } +} +#endif + /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { #ifdef CONFIG_MEMCG_V1 @@ -443,17 +557,8 @@ static const unsigned int memcg_vm_event_stat[] = { #endif PSWPIN, PSWPOUT, - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSCAN_KHUGEPAGED, - PGSCAN_PROACTIVE, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGSTEAL_KHUGEPAGED, - PGSTEAL_PROACTIVE, PGFAULT, PGMAJFAULT, - PGREFILL, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, @@ -503,7 +608,7 @@ static inline int memcg_events_index(enum vm_event_item idx) struct memcg_vmstats_percpu { /* Stats updates since the last flush */ - unsigned int stats_updates; + unsigned long stats_updates; /* Cached pointers for fast iteration in memcg_rstat_updated() */ struct memcg_vmstats_percpu __percpu *parent_pcpu; @@ -534,7 +639,7 @@ struct memcg_vmstats { unsigned long events_pending[NR_MEMCG_EVENTS]; /* Stats updates since the last flush */ - atomic_t stats_updates; + atomic_long_t stats_updates; }; /* @@ -560,16 +665,16 @@ static u64 flush_last_time; static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic_read(&vmstats->stats_updates) > + return atomic_long_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, long val, int cpu) { struct memcg_vmstats_percpu __percpu *statc_pcpu; struct memcg_vmstats_percpu *statc; - unsigned int stats_updates; + unsigned long stats_updates; 
if (!val) return; @@ -592,7 +697,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, continue; stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); - atomic_add(stats_updates, &statc->vmstats->stats_updates); + atomic_long_add(stats_updates, &statc->vmstats->stats_updates); } } @@ -600,7 +705,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); - trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates), + trace_memcg_flush_stats(memcg, atomic_long_read(&memcg->vmstats->stats_updates), force, needs_flush); if (!force && !needs_flush) @@ -679,31 +784,64 @@ static int memcg_page_state_unit(int item); * Normalize the value passed into memcg_rstat_updated() to be in pages. Round * up non-zero sub-page updates to 1 page as zero page updates are ignored. */ -static int memcg_state_val_in_pages(int idx, int val) +static long memcg_state_val_in_pages(int idx, long val) { int unit = memcg_page_state_unit(idx); + long res; if (!val || unit == PAGE_SIZE) return val; - else - return max(val * unit / PAGE_SIZE, 1UL); + + /* Get the absolute value of (val * unit / PAGE_SIZE). */ + res = mult_frac(abs(val), unit, PAGE_SIZE); + /* Round up zero values. */ + res = res ? : 1; + + return val < 0 ? -res : res; } -/** - * mod_memcg_state - update cgroup memory statistics - * @memcg: the memory cgroup - * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item - * @val: delta to add to the counter, can be negative +#ifdef CONFIG_MEMCG_V1 +/* + * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with + * reparenting of non-hierarchical state_locals. 
*/ -void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, - int val) +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg) { - int i = memcg_stats_index(idx); - int cpu; + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return memcg; - if (mem_cgroup_disabled()) + rcu_read_lock(); + + while (memcg_is_dying(memcg)) + memcg = parent_mem_cgroup(memcg); + + return memcg; +} + +static inline void get_non_dying_memcg_end(void) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + rcu_read_unlock(); +} +#else +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg) +{ + return memcg; +} + +static inline void get_non_dying_memcg_end(void) +{ +} +#endif + +static void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, long val) +{ + int i = memcg_stats_index(idx); + int cpu; + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; @@ -712,11 +850,29 @@ void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); memcg_rstat_updated(memcg, val, cpu); + trace_mod_memcg_state(memcg, idx, val); put_cpu(); } +/** + * mod_memcg_state - update cgroup memory statistics + * @memcg: the memory cgroup + * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item + * @val: delta to add to the counter, can be negative + */ +void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, + int val) +{ + if (mem_cgroup_disabled()) + return; + + memcg = get_non_dying_memcg_start(memcg); + __mod_memcg_state(memcg, idx, val); + get_non_dying_memcg_end(); +} + #ifdef CONFIG_MEMCG_V1 /* idx can be of type enum memcg_stat_item or node_stat_item. 
*/ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) @@ -734,23 +890,27 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) #endif return x; } + +void reparent_memcg_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx) +{ + unsigned long value = memcg_page_state_local(memcg, idx); + + __mod_memcg_state(memcg, idx, -value); + __mod_memcg_state(parent, idx, value); +} #endif -static void mod_memcg_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, - int val) +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn, + enum node_stat_item idx, long val) { - struct mem_cgroup_per_node *pn; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = pn->memcg; int i = memcg_stats_index(idx); int cpu; if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - memcg = pn->memcg; - cpu = get_cpu(); /* Update memcg */ @@ -766,6 +926,23 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, put_cpu(); } +static void mod_memcg_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, + int val) +{ + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + struct mem_cgroup_per_node *pn; + struct mem_cgroup *memcg; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = get_non_dying_memcg_start(pn->memcg); + pn = memcg->nodeinfo[pgdat->node_id]; + + __mod_memcg_lruvec_state(pn, idx, val); + + get_non_dying_memcg_end(); +} + /** * mod_lruvec_state - update lruvec memory statistics * @lruvec: the lruvec @@ -986,17 +1163,23 @@ again: /** * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg. * @folio: folio from which memcg should be extracted. + * + * See folio_memcg() for folio->objcg/memcg binding rules. 
*/ struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return NULL; + if (!folio_memcg_charged(folio)) + return root_mem_cgroup; + rcu_read_lock(); - if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; + do { + memcg = folio_memcg(folio); + } while (unlikely(!css_tryget(&memcg->css))); rcu_read_unlock(); return memcg; } @@ -1193,23 +1376,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, } } -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return; - - memcg = folio_memcg(folio); - - if (!memcg) - VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); - else - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); -} -#endif - /** * folio_lruvec_lock - Lock the lruvec for a folio. * @folio: Pointer to the folio. @@ -1219,14 +1385,20 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) * - folio_test_lru false * - folio frozen (refcount of 0) * - * Return: The lruvec this folio is on with its lock held. + * Return: The lruvec this folio is on with its lock held and rcu read lock held. */ struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock(&lruvec->lru_lock); + goto retry; + } return lruvec; } @@ -1241,14 +1413,20 @@ struct lruvec *folio_lruvec_lock(struct folio *folio) * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts - * disabled. + * disabled and rcu read lock held. 
*/ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irq(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irq(&lruvec->lru_lock); + goto retry; + } return lruvec; } @@ -1264,15 +1442,21 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts - * disabled. + * disabled and rcu read lock held. */ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irqsave(&lruvec->lru_lock, *flags); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } return lruvec; } @@ -1288,7 +1472,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, * to or just after a page is removed from an lru list. 
*/ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zid, int nr_pages) + int zid, long nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; @@ -1305,7 +1489,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, size = *lru_size; if (WARN_ONCE(size < 0, - "%s(%p, %d, %d): lru_size %ld\n", + "%s(%p, %d, %ld): lru_size %ld\n", __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; @@ -1359,11 +1543,12 @@ static const struct memory_stat memory_stats[] = { { "sec_pagetables", NR_SECONDARY_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, - { "vmalloc", MEMCG_VMALLOC }, + { "vmalloc", NR_VMALLOC }, { "shmem", NR_SHMEM }, #ifdef CONFIG_ZSWAP { "zswap", MEMCG_ZSWAP_B }, { "zswapped", MEMCG_ZSWAPPED }, + { "zswap_incomp", MEMCG_ZSWAP_INCOMP }, #endif { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -1400,6 +1585,15 @@ static const struct memory_stat memory_stats[] = { { "pgdemote_direct", PGDEMOTE_DIRECT }, { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED }, { "pgdemote_proactive", PGDEMOTE_PROACTIVE }, + { "pgsteal_kswapd", PGSTEAL_KSWAPD }, + { "pgsteal_direct", PGSTEAL_DIRECT }, + { "pgsteal_khugepaged", PGSTEAL_KHUGEPAGED }, + { "pgsteal_proactive", PGSTEAL_PROACTIVE }, + { "pgscan_kswapd", PGSCAN_KSWAPD }, + { "pgscan_direct", PGSCAN_DIRECT }, + { "pgscan_khugepaged", PGSCAN_KHUGEPAGED }, + { "pgscan_proactive", PGSCAN_PROACTIVE }, + { "pgrefill", PGREFILL }, #ifdef CONFIG_NUMA_BALANCING { "pgpromote_success", PGPROMOTE_SUCCESS }, #endif @@ -1443,6 +1637,15 @@ static int memcg_page_state_output_unit(int item) case PGDEMOTE_DIRECT: case PGDEMOTE_KHUGEPAGED: case PGDEMOTE_PROACTIVE: + case PGSTEAL_KSWAPD: + case PGSTEAL_DIRECT: + case PGSTEAL_KHUGEPAGED: + case PGSTEAL_PROACTIVE: + case PGSCAN_KSWAPD: + case PGSCAN_DIRECT: + case PGSCAN_KHUGEPAGED: + case PGSCAN_PROACTIVE: + case PGREFILL: #ifdef CONFIG_NUMA_BALANCING case PGPROMOTE_SUCCESS: #endif 
@@ -1514,15 +1717,15 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) /* Accumulated memory events */ seq_buf_printf(s, "pgscan %lu\n", - memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT) + - memcg_events(memcg, PGSCAN_PROACTIVE) + - memcg_events(memcg, PGSCAN_KHUGEPAGED)); + memcg_page_state(memcg, PGSCAN_KSWAPD) + + memcg_page_state(memcg, PGSCAN_DIRECT) + + memcg_page_state(memcg, PGSCAN_PROACTIVE) + + memcg_page_state(memcg, PGSCAN_KHUGEPAGED)); seq_buf_printf(s, "pgsteal %lu\n", - memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT) + - memcg_events(memcg, PGSTEAL_PROACTIVE) + - memcg_events(memcg, PGSTEAL_KHUGEPAGED)); + memcg_page_state(memcg, PGSTEAL_KSWAPD) + + memcg_page_state(memcg, PGSTEAL_DIRECT) + + memcg_page_state(memcg, PGSTEAL_PROACTIVE) + + memcg_page_state(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { #ifdef CONFIG_MEMCG_V1 @@ -2361,7 +2564,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; unsigned long nr_reclaimed; bool passed_oom = false; - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; + unsigned int reclaim_options; bool drained = false; bool raised_max_event = false; unsigned long pflags; @@ -2375,6 +2578,7 @@ retry: /* Avoid the refill and flush of the older stock */ batch = nr_pages; + reclaim_options = MEMCG_RECLAIM_MAY_SWAP; if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) @@ -2556,17 +2760,17 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) +static void commit_charge(struct folio *folio, struct obj_cgroup *objcg) { VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); /* - * Any of the following ensures page's 
memcg stability: + * Any of the following ensures folio's objcg stability: * * - the page lock * - LRU isolation * - exclusive reference */ - folio->memcg_data = (unsigned long)memcg; + folio->memcg_data = (unsigned long)objcg; } #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC @@ -2668,14 +2872,26 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p) static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { - struct obj_cgroup *objcg = NULL; + int nid = numa_node_id(); + + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg); - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { - objcg = rcu_dereference(memcg->objcg); if (likely(objcg && obj_cgroup_tryget(objcg))) - break; - objcg = NULL; + return objcg; } + + return NULL; +} + +static inline struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg; + + rcu_read_lock(); + objcg = __get_obj_cgroup_from_memcg(memcg); + rcu_read_unlock(); + return objcg; } @@ -2734,6 +2950,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) { struct mem_cgroup *memcg; struct obj_cgroup *objcg; + int nid = numa_node_id(); if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi()) return NULL; @@ -2750,53 +2967,39 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) * Objcg reference is kept by the task, so it's safe * to use the objcg by the current task. */ - return objcg; + return objcg ? 
: rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); } memcg = this_cpu_read(int_active_memcg); if (unlikely(memcg)) goto from_memcg; - return NULL; + return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); from_memcg: - objcg = NULL; - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { + for (; memcg; memcg = parent_mem_cgroup(memcg)) { /* * Memcg pointer is protected by scope (see set_active_memcg()) * and is pinning the corresponding objcg, so objcg can't go * away and can be used within the scope without any additional * protection. */ - objcg = rcu_dereference_check(memcg->objcg, 1); + objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1); if (likely(objcg)) - break; + return objcg; } - return objcg; + return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); } struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; - if (!memcg_kmem_online()) - return NULL; - - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); + objcg = folio_objcg(folio); + if (objcg) obj_cgroup_get(objcg); - } else { - struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = __folio_memcg(folio); - if (memcg) - objcg = __get_obj_cgroup_from_memcg(memcg); - else - objcg = NULL; - rcu_read_unlock(); - } return objcg; } @@ -2897,7 +3100,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) int ret = 0; objcg = current_obj_cgroup(); - if (objcg) { + if (objcg && !obj_cgroup_is_root(objcg)) { ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { obj_cgroup_get(objcg); @@ -2926,12 +3129,30 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } +static struct obj_stock_pcp *trylock_stock(void) +{ + if (local_trylock(&obj_stock.lock)) + return this_cpu_ptr(&obj_stock); + + return NULL; +} + +static void unlock_stock(struct obj_stock_pcp *stock) +{ + if (stock) + local_unlock(&obj_stock.lock); +} + +/* Call after 
__refill_obj_stock() to ensure stock->cached_objg == objcg */ static void __account_obj_stock(struct obj_cgroup *objcg, struct obj_stock_pcp *stock, int nr, struct pglist_data *pgdat, enum node_stat_item idx) { int *bytes; + if (!stock || READ_ONCE(stock->cached_objcg) != objcg) + goto direct; + /* * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat changes. @@ -2971,29 +3192,35 @@ static void __account_obj_stock(struct obj_cgroup *objcg, nr = 0; } } +direct: if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); } -static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - struct pglist_data *pgdat, enum node_stat_item idx) +static bool __consume_obj_stock(struct obj_cgroup *objcg, + struct obj_stock_pcp *stock, + unsigned int nr_bytes) +{ + if (objcg == READ_ONCE(stock->cached_objcg) && + stock->nr_bytes >= nr_bytes) { + stock->nr_bytes -= nr_bytes; + return true; + } + + return false; +} + +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { struct obj_stock_pcp *stock; bool ret = false; - if (!local_trylock(&obj_stock.lock)) + stock = trylock_stock(); + if (!stock) return ret; - stock = this_cpu_ptr(&obj_stock); - if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { - stock->nr_bytes -= nr_bytes; - ret = true; - - if (pgdat) - __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); - } - - local_unlock(&obj_stock.lock); + ret = __consume_obj_stock(objcg, stock, nr_bytes); + unlock_stock(stock); return ret; } @@ -3077,23 +3304,20 @@ static bool obj_stock_flush_required(struct obj_stock_pcp *stock, return flush; } -static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, - enum node_stat_item idx) +static void __refill_obj_stock(struct obj_cgroup *objcg, + struct obj_stock_pcp *stock, + unsigned int nr_bytes, + bool allow_uncharge) { - struct 
obj_stock_pcp *stock; unsigned int nr_pages = 0; - if (!local_trylock(&obj_stock.lock)) { - if (pgdat) - mod_objcg_mlstate(objcg, pgdat, idx, nr_acct); + if (!stock) { nr_pages = nr_bytes >> PAGE_SHIFT; nr_bytes = nr_bytes & (PAGE_SIZE - 1); atomic_add(nr_bytes, &objcg->nr_charged_bytes); goto out; } - stock = this_cpu_ptr(&obj_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ drain_obj_stock(stock); obj_cgroup_get(objcg); @@ -3105,27 +3329,45 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } stock->nr_bytes += nr_bytes; - if (pgdat) - __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); - if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { nr_pages = stock->nr_bytes >> PAGE_SHIFT; stock->nr_bytes &= (PAGE_SIZE - 1); } - local_unlock(&obj_stock.lock); out: if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); } -static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, - struct pglist_data *pgdat, enum node_stat_item idx) +static void refill_obj_stock(struct obj_cgroup *objcg, + unsigned int nr_bytes, + bool allow_uncharge) +{ + struct obj_stock_pcp *stock = trylock_stock(); + __refill_obj_stock(objcg, stock, nr_bytes, allow_uncharge); + unlock_stock(stock); +} + +static int __obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, + size_t size, size_t *remainder) +{ + size_t charge_size; + int ret; + + charge_size = PAGE_ALIGN(size); + ret = obj_cgroup_charge_pages(objcg, gfp, charge_size >> PAGE_SHIFT); + if (!ret) + *remainder = charge_size - size; + + return ret; +} + +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) { - unsigned int nr_pages, nr_bytes; + size_t remainder; int ret; - if (likely(consume_obj_stock(objcg, size, pgdat, idx))) + if (likely(consume_obj_stock(objcg, size))) return 0; /* @@ -3151,28 +3393,16 @@ static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t * bytes is (sizeof(object) + PAGE_SIZE 
- 2) if there is no data * race. */ - nr_pages = size >> PAGE_SHIFT; - nr_bytes = size & (PAGE_SIZE - 1); - - if (nr_bytes) - nr_pages += 1; - - ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); - if (!ret && (nr_bytes || pgdat)) - refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0, - false, size, pgdat, idx); + ret = __obj_cgroup_charge(objcg, gfp, size, &remainder); + if (!ret && remainder) + refill_obj_stock(objcg, remainder, false); return ret; } -int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) -{ - return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); -} - void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size, true, 0, NULL, 0); + refill_obj_stock(objcg, size, true); } static inline size_t obj_full_size(struct kmem_cache *s) @@ -3187,6 +3417,7 @@ static inline size_t obj_full_size(struct kmem_cache *s) bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p) { + size_t obj_size = obj_full_size(s); struct obj_cgroup *objcg; struct slab *slab; unsigned long off; @@ -3198,7 +3429,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, * obj_cgroup_get() is used to get a permanent reference. 
*/ objcg = current_obj_cgroup(); - if (!objcg) + if (!objcg || obj_cgroup_is_root(objcg)) return true; /* @@ -3227,6 +3458,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, for (i = 0; i < size; i++) { unsigned long obj_exts; struct slabobj_ext *obj_ext; + struct obj_stock_pcp *stock; slab = virt_to_slab(p[i]); @@ -3246,9 +3478,20 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, * TODO: we could batch this until slab_pgdat(slab) changes * between iterations, with a more complicated undo */ - if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s), - slab_pgdat(slab), cache_vmstat_idx(s))) - return false; + stock = trylock_stock(); + if (!stock || !__consume_obj_stock(objcg, stock, obj_size)) { + size_t remainder; + + unlock_stock(stock); + if (__obj_cgroup_charge(objcg, flags, obj_size, &remainder)) + return false; + stock = trylock_stock(); + if (remainder) + __refill_obj_stock(objcg, stock, remainder, false); + } + __account_obj_stock(objcg, stock, obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); + unlock_stock(stock); obj_exts = slab_obj_exts(slab); get_slab_obj_exts(obj_exts); @@ -3270,6 +3513,7 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; struct slabobj_ext *obj_ext; + struct obj_stock_pcp *stock; unsigned int off; off = obj_to_index(s, slab, p[i]); @@ -3279,8 +3523,13 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, continue; obj_ext->objcg = NULL; - refill_obj_stock(objcg, obj_size, true, -obj_size, - slab_pgdat(slab), cache_vmstat_idx(s)); + + stock = trylock_stock(); + __refill_obj_stock(objcg, stock, obj_size, true); + __account_obj_stock(objcg, stock, -obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); + unlock_stock(stock); + obj_cgroup_put(objcg); } } @@ -3312,33 +3561,20 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order, return; new_refs = (1 << 
(old_order - new_order)) - 1; - css_get_many(&__folio_memcg(folio)->css, new_refs); + obj_cgroup_get_many(folio_objcg(folio), new_refs); } -static int memcg_online_kmem(struct mem_cgroup *memcg) +static void memcg_online_kmem(struct mem_cgroup *memcg) { - struct obj_cgroup *objcg; - if (mem_cgroup_kmem_disabled()) - return 0; + return; if (unlikely(mem_cgroup_is_root(memcg))) - return 0; - - objcg = obj_cgroup_alloc(); - if (!objcg) - return -ENOMEM; - - objcg->memcg = memcg; - rcu_assign_pointer(memcg->objcg, objcg); - obj_cgroup_get(objcg); - memcg->orig_objcg = objcg; + return; static_branch_enable(&memcg_kmem_online_key); memcg->kmemcg_id = memcg->id.id; - - return 0; } static void memcg_offline_kmem(struct mem_cgroup *memcg) @@ -3352,16 +3588,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) return; parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - memcg_reparent_list_lrus(memcg, parent); - - /* - * Objcg's reparenting must be after list_lru's, make sure list_lru - * helpers won't use parent's list_lru until child is drained. 
- */ - memcg_reparent_objcgs(memcg, parent); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -3612,13 +3839,7 @@ static void mem_cgroup_private_id_remove(struct mem_cgroup *memcg) } } -void __maybe_unused mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, - unsigned int n) -{ - refcount_add(n, &memcg->id.ref); -} - -static void mem_cgroup_private_id_put_many(struct mem_cgroup *memcg, unsigned int n) +static inline void mem_cgroup_private_id_put(struct mem_cgroup *memcg, unsigned int n) { if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_private_id_remove(memcg); @@ -3628,14 +3849,9 @@ static void mem_cgroup_private_id_put_many(struct mem_cgroup *memcg, unsigned in } } -static inline void mem_cgroup_private_id_put(struct mem_cgroup *memcg) -{ - mem_cgroup_private_id_put_many(memcg, 1); -} - -struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg) +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, unsigned int n) { - while (!refcount_inc_not_zero(&memcg->id.ref)) { + while (!refcount_add_not_zero(n, &memcg->id.ref)) { /* * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. 
@@ -3645,8 +3861,6 @@ struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg) break; } memcg = parent_mem_cgroup(memcg); - if (!memcg) - memcg = root_mem_cgroup; } return memcg; } @@ -3711,6 +3925,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn->lruvec_stats_percpu) goto fail; + INIT_LIST_HEAD(&pn->objcg_list); + lruvec_init(&pn->lruvec); pn->memcg = memcg; @@ -3725,10 +3941,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - obj_cgroup_put(memcg->orig_objcg); + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + continue; - for_each_node(node) - free_mem_cgroup_per_node_info(memcg->nodeinfo[node]); + obj_cgroup_put(pn->orig_objcg); + free_mem_cgroup_per_node_info(pn); + } memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); @@ -3799,7 +4019,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) #endif memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->objcg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) @@ -3875,9 +4094,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct obj_cgroup *objcg; + int nid; - if (memcg_online_kmem(memcg)) - goto remove_id; + memcg_online_kmem(memcg); /* * A memcg must be visible for expand_shrinker_info() @@ -3887,6 +4107,20 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; + for_each_node(nid) { + objcg = obj_cgroup_alloc(); + if (!objcg) + goto free_objcg; + + if (unlikely(mem_cgroup_is_root(memcg))) + objcg->is_root = true; + + objcg->memcg = memcg; + rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg); + obj_cgroup_get(objcg); + 
memcg->nodeinfo[nid]->orig_objcg = objcg; + } + if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_dfl_wq, &stats_flush_dwork, FLUSH_TIME); @@ -3909,9 +4143,27 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; +free_objcg: + for_each_node(nid) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + + objcg = rcu_replace_pointer(pn->objcg, NULL, true); + if (objcg) + percpu_ref_kill(&objcg->refcnt); + + if (pn->orig_objcg) { + obj_cgroup_put(pn->orig_objcg); + /* + * Reset pn->orig_objcg to NULL to prevent + * obj_cgroup_put() from being called again in + * __mem_cgroup_free(). + */ + pn->orig_objcg = NULL; + } + } + free_shrinker_info(memcg); offline_kmem: memcg_offline_kmem(memcg); -remove_id: mem_cgroup_private_id_remove(memcg); return -ENOMEM; } @@ -3929,13 +4181,19 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_deferred_split_queue(memcg); + /* + * The reparenting of objcg must be after the reparenting of the + * list_lru and deferred_split_queue above, which ensures that they will + * not mistakenly get the parent list_lru and deferred_split_queue. 
+ */ + memcg_reparent_objcgs(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); lru_gen_offline_memcg(memcg); drain_all_stock(memcg); - mem_cgroup_private_id_put(memcg); + mem_cgroup_private_id_put(memcg, 1); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -4161,8 +4419,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ - if (atomic_read(&memcg->vmstats->stats_updates)) - atomic_set(&memcg->vmstats->stats_updates, 0); + if (atomic_long_read(&memcg->vmstats->stats_updates)) + atomic_long_set(&memcg->vmstats->stats_updates, 0); } static void mem_cgroup_fork(struct task_struct *task) @@ -4739,16 +4997,20 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { - int ret; - - ret = try_charge(memcg, gfp, folio_nr_pages(folio)); - if (ret) - goto out; + int ret = 0; + struct obj_cgroup *objcg; - css_get(&memcg->css); - commit_charge(folio, memcg); + objcg = get_obj_cgroup_from_memcg(memcg); + /* Do not account at the root objcg level. 
*/ + if (!obj_cgroup_is_root(objcg)) + ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio)); + if (ret) { + obj_cgroup_put(objcg); + return ret; + } + commit_charge(folio, objcg); memcg1_commit_charge(folio, memcg); -out: + return ret; } @@ -4834,7 +5096,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, } struct uncharge_gather { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; @@ -4848,58 +5110,52 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(ug->objcg); if (ug->nr_memory) { - memcg_uncharge(ug->memcg, ug->nr_memory); + memcg_uncharge(memcg, ug->nr_memory); if (ug->nr_kmem) { - mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); - memcg1_account_kmem(ug->memcg, -ug->nr_kmem); + mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem); + memcg1_account_kmem(memcg, -ug->nr_kmem); } - memcg1_oom_recover(ug->memcg); + memcg1_oom_recover(memcg); } - memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid); + memcg1_uncharge_batch(memcg, ug->pgpgout, ug->nr_memory, ug->nid); + rcu_read_unlock(); /* drop reference from uncharge_folio */ - css_put(&ug->memcg->css); + obj_cgroup_put(ug->objcg); } static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { long nr_pages; - struct mem_cgroup *memcg; struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Nobody should be changing or seriously looking at - * folio memcg or objcg at this point, we have fully - * exclusive access to the folio. + * folio objcg at this point, we have fully exclusive + * access to the folio. */ - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); - /* - * This get matches the put at the end of the function and - * kmem pages do not hold memcg references anymore. 
- */ - memcg = get_mem_cgroup_from_objcg(objcg); - } else { - memcg = __folio_memcg(folio); - } - - if (!memcg) + objcg = folio_objcg(folio); + if (!objcg) return; - if (ug->memcg != memcg) { - if (ug->memcg) { + if (ug->objcg != objcg) { + if (ug->objcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = memcg; + ug->objcg = objcg; ug->nid = folio_nid(folio); - /* pairs with css_put in uncharge_batch */ - css_get(&memcg->css); + /* pairs with obj_cgroup_put in uncharge_batch */ + obj_cgroup_get(objcg); } nr_pages = folio_nr_pages(folio); @@ -4907,20 +5163,17 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) if (folio_memcg_kmem(folio)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - - folio->memcg_data = 0; - obj_cgroup_put(objcg); } else { /* LRU pages aren't accounted at the root level */ - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) ug->nr_memory += nr_pages; ug->pgpgout++; WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); - folio->memcg_data = 0; } - css_put(&memcg->css); + folio->memcg_data = 0; + obj_cgroup_put(objcg); } void __mem_cgroup_uncharge(struct folio *folio) @@ -4944,7 +5197,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) uncharge_gather_clear(&ug); for (i = 0; i < folios->nr; i++) uncharge_folio(folios->folios[i], &ug); - if (ug.memcg) + if (ug.objcg) uncharge_batch(&ug); } @@ -4961,6 +5214,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; + struct obj_cgroup *objcg; long nr_pages = folio_nr_pages(new); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); @@ -4975,21 +5229,24 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) if (folio_memcg_charged(new)) return; - memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); - if (!memcg) + objcg = folio_objcg(old); + VM_WARN_ON_ONCE_FOLIO(!objcg, old); + if (!objcg) return; + 
rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); /* Force-charge the new page. The old one will be freed soon */ - if (!mem_cgroup_is_root(memcg)) { + if (!obj_cgroup_is_root(objcg)) { page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); } - css_get(&memcg->css); - commit_charge(new, memcg); + obj_cgroup_get(objcg); + commit_charge(new, objcg); memcg1_commit_charge(new, memcg); + rcu_read_unlock(); } /** @@ -5005,7 +5262,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) */ void mem_cgroup_migrate(struct folio *old, struct folio *new) { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); @@ -5016,18 +5273,18 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) if (mem_cgroup_disabled()) return; - memcg = folio_memcg(old); + objcg = folio_objcg(old); /* - * Note that it is normal to see !memcg for a hugetlb folio. + * Note that it is normal to see !objcg for a hugetlb folio. * For e.g, it could have been allocated when memory_hugetlb_accounting * was not selected. 
*/ - VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); - if (!memcg) + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !objcg, old); + if (!objcg) return; - /* Transfer the charge and the css ref */ - commit_charge(new, memcg); + /* Transfer the charge and the objcg ref */ + commit_charge(new, objcg); /* Warning should never happen, so don't worry about refcount non-0 */ WARN_ON_ONCE(folio_unqueue_deferred_split(old)); @@ -5210,34 +5467,35 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; struct mem_cgroup *memcg; + struct obj_cgroup *objcg; if (do_memsw_account()) return 0; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return 0; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); if (!entry.val) { memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + rcu_read_unlock(); return 0; } - memcg = mem_cgroup_private_id_get_online(memcg); + memcg = mem_cgroup_private_id_get_online(memcg, nr_pages); + /* memcg is pined by memcg ID. 
*/ + rcu_read_unlock(); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); - mem_cgroup_private_id_put(memcg); + mem_cgroup_private_id_put(memcg, nr_pages); return -ENOMEM; } - - /* Get references for the tail pages, too */ - if (nr_pages > 1) - mem_cgroup_private_id_get_many(memcg, nr_pages - 1); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry); @@ -5266,7 +5524,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); - mem_cgroup_private_id_put_many(memcg, nr_pages); + mem_cgroup_private_id_put(memcg, nr_pages); } rcu_read_unlock(); } @@ -5287,27 +5545,29 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; + bool ret = false; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; - if (do_memsw_account()) - return false; + if (do_memsw_account() || !folio_memcg_charged(folio)) + return ret; + rcu_read_lock(); memcg = folio_memcg(folio); - if (!memcg) - return false; - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long usage = page_counter_read(&memcg->swap); if (usage * 2 >= READ_ONCE(memcg->swap.high) || - usage * 2 >= READ_ONCE(memcg->swap.max)) - return true; + usage * 2 >= READ_ONCE(memcg->swap.max)) { + ret = true; + break; + } } + rcu_read_unlock(); - return false; + return ret; } static int __init setup_swap_account(char *s) @@ -5503,6 +5763,9 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + if (obj_cgroup_is_root(objcg)) + return; + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); /* PF_MEMALLOC context, charging must succeed 
*/ @@ -5513,6 +5776,8 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) memcg = obj_cgroup_memcg(objcg); mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); + if (size == PAGE_SIZE) + mod_memcg_state(memcg, MEMCG_ZSWAP_INCOMP, 1); rcu_read_unlock(); } @@ -5530,12 +5795,17 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + if (obj_cgroup_is_root(objcg)) + return; + obj_cgroup_uncharge(objcg, size); rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); + if (size == PAGE_SIZE) + mod_memcg_state(memcg, MEMCG_ZSWAP_INCOMP, -1); rcu_read_unlock(); } diff --git a/mm/memfd.c b/mm/memfd.c index 919c2a53eb96..fb425f4e315f 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -227,7 +227,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) F_SEAL_WRITE | \ F_SEAL_FUTURE_WRITE) -static int memfd_add_seals(struct file *file, unsigned int seals) +int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; @@ -309,7 +309,7 @@ unlock: return error; } -static int memfd_get_seals(struct file *file) +int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index b8edb9f981d7..b02b503c750d 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -79,6 +79,8 @@ #include <linux/shmem_fs.h> #include <linux/vmalloc.h> #include <linux/memfd.h> +#include <uapi/linux/memfd.h> + #include "internal.h" static int memfd_luo_preserve_folios(struct file *file, @@ -103,7 +105,6 @@ static int memfd_luo_preserve_folios(struct file *file, if (!size) { *nr_foliosp = 0; *out_folios_ser = NULL; - memset(kho_vmalloc, 0, sizeof(*kho_vmalloc)); return 0; } @@ -259,7 +260,7 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) 
struct memfd_luo_folio_ser *folios_ser; struct memfd_luo_ser *ser; u64 nr_folios; - int err = 0; + int err = 0, seals; inode_lock(inode); shmem_freeze(inode, true); @@ -271,8 +272,21 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) goto err_unlock; } + seals = memfd_get_seals(args->file); + if (seals < 0) { + err = seals; + goto err_free_ser; + } + + /* Make sure the file only has the seals supported by this version. */ + if (seals & ~MEMFD_LUO_ALL_SEALS) { + err = -EOPNOTSUPP; + goto err_free_ser; + } + ser->pos = args->file->f_pos; ser->size = i_size_read(inode); + ser->seals = seals; err = memfd_luo_preserve_folios(args->file, &ser->folios, &folios_ser, &nr_folios); @@ -395,6 +409,7 @@ static int memfd_luo_retrieve_folios(struct file *file, struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; struct folio *folio; + long npages, nr_added_pages = 0; int err = -EIO; long i; @@ -441,21 +456,26 @@ static int memfd_luo_retrieve_folios(struct file *file, if (flags & MEMFD_LUO_FOLIO_DIRTY) folio_mark_dirty(folio); - err = shmem_inode_acct_blocks(inode, 1); + npages = folio_nr_pages(folio); + err = shmem_inode_acct_blocks(inode, npages); if (err) { - pr_err("shmem: failed to account folio index %ld: %d\n", - i, err); - goto unlock_folio; + pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n", + i, npages, err); + goto remove_from_cache; } - shmem_recalc_inode(inode, 1, 0); + nr_added_pages += npages; folio_add_lru(folio); folio_unlock(folio); folio_put(folio); } + shmem_recalc_inode(inode, nr_added_pages, 0); + return 0; +remove_from_cache: + filemap_remove_folio(folio); unlock_folio: folio_unlock(folio); folio_put(folio); @@ -466,12 +486,19 @@ put_folios: */ for (long j = i + 1; j < nr_folios; j++) { const struct memfd_luo_folio_ser *pfolio = &folios_ser[j]; + phys_addr_t phys; + + if (!pfolio->pfn) + continue; - folio = kho_restore_folio(pfolio->pfn); + phys = PFN_PHYS(pfolio->pfn); + folio = 
kho_restore_folio(phys); if (folio) folio_put(folio); } + shmem_recalc_inode(inode, nr_added_pages, 0); + return err; } @@ -486,15 +513,31 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) if (!ser) return -EINVAL; - file = memfd_alloc_file("", 0); + /* Make sure the file only has seals supported by this version. */ + if (ser->seals & ~MEMFD_LUO_ALL_SEALS) { + err = -EOPNOTSUPP; + goto free_ser; + } + + /* + * The seals are preserved. Allow sealing here so they can be added + * later. + */ + file = memfd_alloc_file("", MFD_ALLOW_SEALING); if (IS_ERR(file)) { pr_err("failed to setup file: %pe\n", file); err = PTR_ERR(file); goto free_ser; } + err = memfd_add_seals(file, ser->seals); + if (err) { + pr_err("failed to add seals: %pe\n", ERR_PTR(err)); + goto put_file; + } + vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); - file->f_inode->i_size = ser->size; + i_size_write(file_inode(file), ser->size); if (ser->nr_folios) { folios_ser = kho_restore_vmalloc(&ser->folios); @@ -529,6 +572,11 @@ static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler, return shmem_file(file) && !inode->i_nlink; } +static unsigned long memfd_luo_get_id(struct file *file) +{ + return (unsigned long)file_inode(file); +} + static const struct liveupdate_file_ops memfd_luo_file_ops = { .freeze = memfd_luo_freeze, .finish = memfd_luo_finish, @@ -536,6 +584,7 @@ static const struct liveupdate_file_ops memfd_luo_file_ops = { .preserve = memfd_luo_preserve, .unpreserve = memfd_luo_unpreserve, .can_preserve = memfd_luo_can_preserve, + .get_id = memfd_luo_get_id, .owner = THIS_MODULE, }; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 986f809376eb..54851d8a195b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -69,7 +69,7 @@ bool folio_use_access_time(struct folio *folio) } #endif -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION static int top_tier_adistance; /* * node_demotion[] examples: @@ -129,7 +129,7 @@ static int top_tier_adistance; * 
*/ static struct demotion_nodes *node_demotion __read_mostly; -#endif /* CONFIG_MIGRATION */ +#endif /* CONFIG_NUMA_MIGRATION */ static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); @@ -273,7 +273,7 @@ static struct memory_tier *__node_get_memory_tier(int node) lockdep_is_held(&memory_tier_lock)); } -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION bool node_is_toptier(int node) { bool toptier; @@ -519,7 +519,7 @@ static void establish_demotion_targets(void) #else static inline void establish_demotion_targets(void) {} -#endif /* CONFIG_MIGRATION */ +#endif /* CONFIG_NUMA_MIGRATION */ static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) { @@ -911,7 +911,7 @@ static int __init memory_tier_init(void) if (ret) panic("%s() failed to register memory tier subsystem\n", __func__); -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION node_demotion = kzalloc_objs(struct demotion_nodes, nr_node_ids); WARN_ON(!node_demotion); #endif @@ -938,7 +938,7 @@ subsys_initcall(memory_tier_init); bool numa_demotion_enabled = false; -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION #ifdef CONFIG_SYSFS static ssize_t demotion_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/mm/memory.c b/mm/memory.c index 2f815a34d924..ea6568571131 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -162,21 +162,8 @@ static int __init disable_randmaps(char *s) } __setup("norandmaps", disable_randmaps); -unsigned long zero_pfn __read_mostly; -EXPORT_SYMBOL(zero_pfn); - unsigned long highest_memmap_pfn __read_mostly; -/* - * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() - */ -static int __init init_zero_pfn(void) -{ - zero_pfn = page_to_pfn(ZERO_PAGE(0)); - return 0; -} -early_initcall(init_zero_pfn); - void mm_trace_rss_stat(struct mm_struct *mm, int member) { trace_rss_stat(mm, member); @@ -1346,7 +1333,7 @@ again: if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); - if (add_swap_count_continuation(entry, 
GFP_KERNEL) < 0) { + if (swap_retry_table_alloc(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } @@ -1567,11 +1554,13 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details || details->reclaim_pt) + if (!details) return true; + VM_WARN_ON_ONCE(details->skip_cows && details->reclaim_pt); + /* Or, we zap COWed pages only if the caller wants to */ - return details->even_cows; + return !details->skip_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ @@ -2006,13 +1995,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { - spinlock_t *ptl = pmd_lock(tlb->mm, pmd); - /* - * Take and drop THP pmd lock so that we cannot return - * prematurely, while zap_huge_pmd() has cleared *pmd, - * but not yet decremented compound_mapcount(). 
- */ - spin_unlock(ptl); + sync_with_folio_pmd_zap(tlb->mm, pmd); } if (pmd_none(*pmd)) { addr = next; @@ -2073,65 +2056,74 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, return addr; } -void unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct zap_details *details) +static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct zap_details *details) { - pgd_t *pgd; - unsigned long next; + const bool reaping = details && details->reaping; - BUG_ON(addr >= end); - tlb_start_vma(tlb, vma); - pgd = pgd_offset(vma->vm_mm, addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - next = zap_p4d_range(tlb, vma, pgd, addr, next, details); - } while (pgd++, addr = next, addr != end); - tlb_end_vma(tlb, vma); -} + VM_WARN_ON_ONCE(start >= end || !range_in_vma(vma, start, end)); + /* uprobe_munmap() might sleep, so skip it when reaping. */ + if (vma->vm_file && !reaping) + uprobe_munmap(vma, start, end); -static void unmap_single_vma(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, struct zap_details *details) -{ - unsigned long start = max(vma->vm_start, start_addr); - unsigned long end; + if (unlikely(is_vm_hugetlb_page(vma))) { + zap_flags_t zap_flags = details ? details->zap_flags : 0; - if (start >= vma->vm_end) - return; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) - return; + VM_WARN_ON_ONCE(reaping); + /* + * vm_file will be NULL when we fail early while instantiating + * a new mapping. In this case, no pages were mapped yet and + * there is nothing to do. 
+ */ + if (!vma->vm_file) + return; + __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); + } else { + unsigned long next, addr = start; + pgd_t *pgd; - if (vma->vm_file) - uprobe_munmap(vma, start, end); + tlb_start_vma(tlb, vma); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + next = zap_p4d_range(tlb, vma, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); + tlb_end_vma(tlb, vma); + } +} - if (start != end) { - if (unlikely(is_vm_hugetlb_page(vma))) { - /* - * It is undesirable to test vma->vm_file as it - * should be non-null for valid hugetlb area. - * However, vm_file will be NULL in the error - * cleanup path of mmap_region. When - * hugetlbfs ->mmap method fails, - * mmap_region() nullifies vma->vm_file - * before calling this function to clean up. - * Since no pte has actually been setup, it is - * safe to do nothing in this case. - */ - if (vma->vm_file) { - zap_flags_t zap_flags = details ? - details->zap_flags : 0; - __unmap_hugepage_range(tlb, vma, start, end, - NULL, zap_flags); - } - } else - unmap_page_range(tlb, vma, start, end, details); +/** + * zap_vma_for_reaping - zap all page table entries in the vma without blocking + * @vma: The vma to zap. + * + * Zap all page table entries in the vma without blocking for use by the oom + * killer. Hugetlb vmas are not supported. + * + * Returns: 0 on success, -EBUSY if we would have to block. 
+ */ +int zap_vma_for_reaping(struct vm_area_struct *vma) +{ + struct zap_details details = { + .reaping = true, + }; + struct mmu_notifier_range range; + struct mmu_gather tlb; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, + vma->vm_start, vma->vm_end); + tlb_gather_mmu(&tlb, vma->vm_mm); + if (mmu_notifier_invalidate_range_start_nonblock(&range)) { + tlb_finish_mmu(&tlb); + return -EBUSY; } + __zap_vma_range(&tlb, vma, range.start, range.end, &details); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); + return 0; } /** @@ -2156,8 +2148,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, - /* Careful - we need to zap private pages too! */ - .even_cows = true, }; vma = unmap->first; @@ -2165,10 +2155,11 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) unmap->vma_start, unmap->vma_end); mmu_notifier_invalidate_range_start(&range); do { - unsigned long start = unmap->vma_start; - unsigned long end = unmap->vma_end; + unsigned long start = max(vma->vm_start, unmap->vma_start); + unsigned long end = min(vma->vm_end, unmap->vma_end); + hugetlb_zap_begin(vma, &start, &end); - unmap_single_vma(tlb, vma, start, end, &details); + __zap_vma_range(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); vma = mas_find(unmap->mas, unmap->tree_end - 1); } while (vma); @@ -2176,17 +2167,20 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) } /** - * zap_page_range_single_batched - remove user pages in a given range + * zap_vma_range_batched - zap page table entries in a vma range * @tlb: pointer to the caller's struct mmu_gather - * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to remove - * @size: number of bytes to remove - * @details: details of shared cache invalidation + * @vma: the vma covering the range to 
zap + * @address: starting address of the range to zap + * @size: number of bytes to zap + * @details: details specifying zapping behavior + * + * @tlb must not be NULL. The provided address range must be fully + * contained within @vma. If @vma is for hugetlb, @tlb is flushed and + * re-initialized by this function. * - * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for - * hugetlb, @tlb is flushed and re-initialized by this function. + * If @details is NULL, this function will zap all page table entries. */ -void zap_page_range_single_batched(struct mmu_gather *tlb, +void zap_vma_range_batched(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { @@ -2195,6 +2189,9 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); + if (unlikely(!size)) + return; + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); @@ -2204,7 +2201,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(tlb, vma, address, end, details); + __zap_vma_range(tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); if (is_vm_hugetlb_page(vma)) { /* @@ -2218,45 +2215,42 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, } /** - * zap_page_range_single - remove user pages in a given range - * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap + * zap_vma_range - zap all page table entries in a vma range + * @vma: the vma covering the range to zap + * @address: starting address of the range to zap * @size: number of bytes to zap - * @details: details of shared cache invalidation * - * The range must fit into one VMA. 
+ * The provided address range must be fully contained within @vma. */ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details) +void zap_vma_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size) { struct mmu_gather tlb; tlb_gather_mmu(&tlb, vma->vm_mm); - zap_page_range_single_batched(&tlb, vma, address, size, details); + zap_vma_range_batched(&tlb, vma, address, size, NULL); tlb_finish_mmu(&tlb); } /** - * zap_vma_ptes - remove ptes mapping the vma - * @vma: vm_area_struct holding ptes to be zapped - * @address: starting address of pages to zap + * zap_special_vma_range - zap all page table entries in a special vma range + * @vma: the vma covering the range to zap + * @address: starting address of the range to zap * @size: number of bytes to zap * - * This function only unmaps ptes assigned to VM_PFNMAP vmas. - * - * The entire address range must be fully contained within the vma. - * + * This function does nothing when the provided address range is not fully + * contained in @vma, or when the @vma is not VM_PFNMAP or VM_MIXEDMAP. 
*/ -void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, +void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || - !(vma->vm_flags & VM_PFNMAP)) + !(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) return; - zap_page_range_single(vma, address, size, NULL); + zap_vma_range(vma, address, size); } -EXPORT_SYMBOL_GPL(zap_vma_ptes); +EXPORT_SYMBOL_GPL(zap_special_vma_range); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { @@ -2490,13 +2484,14 @@ out: int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { - const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; + const unsigned long nr_pages = *num; + const unsigned long end = addr + PAGE_SIZE * nr_pages; - if (addr < vma->vm_start || end_addr >= vma->vm_end) + if (!range_in_vma(vma, addr, end)) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { - BUG_ON(mmap_read_trylock(vma->vm_mm)); - BUG_ON(vma->vm_flags & VM_PFNMAP); + VM_WARN_ON_ONCE(mmap_read_trylock(vma->vm_mm)); + VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. 
*/ @@ -2504,6 +2499,39 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_pages); +int map_kernel_pages_prepare(struct vm_area_desc *desc) +{ + const struct mmap_action *action = &desc->action; + const unsigned long addr = action->map_kernel.start; + unsigned long nr_pages, end; + + if (!vma_desc_test(desc, VMA_MIXEDMAP_BIT)) { + VM_WARN_ON_ONCE(mmap_read_trylock(desc->mm)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_PFNMAP_BIT)); + vma_desc_set_flags(desc, VMA_MIXEDMAP_BIT); + } + + nr_pages = action->map_kernel.nr_pages; + end = addr + PAGE_SIZE * nr_pages; + if (!range_in_vma_desc(desc, addr, end)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(map_kernel_pages_prepare); + +int map_kernel_pages_complete(struct vm_area_struct *vma, + struct mmap_action *action) +{ + unsigned long nr_pages; + + nr_pages = action->map_kernel.nr_pages; + return insert_pages(vma, action->map_kernel.start, + action->map_kernel.pages, + &nr_pages, vma->vm_page_prot); +} +EXPORT_SYMBOL(map_kernel_pages_complete); + /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to @@ -2988,7 +3016,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS)); + VM_WARN_ON_ONCE(!vma_test_all_mask(vma, VMA_REMAP_FLAGS)); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -3022,7 +3050,7 @@ static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long add * maintain page reference counts, and callers may free * pages due to the error. So zap it early. 
*/ - zap_page_range_single(vma, addr, size, NULL); + zap_vma_range(vma, addr, size); return error; } @@ -3105,26 +3133,37 @@ static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } #endif -void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +int remap_pfn_range_prepare(struct vm_area_desc *desc) { - /* - * We set addr=VMA start, end=VMA end here, so this won't fail, but we - * check it again on complete and will fail there if specified addr is - * invalid. - */ - get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end, - desc->start, desc->end, pfn, &desc->pgoff); + const struct mmap_action *action = &desc->action; + const unsigned long start = action->remap.start; + const unsigned long end = start + action->remap.size; + const unsigned long pfn = action->remap.start_pfn; + const bool is_cow = vma_desc_is_cow_mapping(desc); + int err; + + if (!range_in_vma_desc(desc, start, end)) + return -EFAULT; + + err = get_remap_pgoff(is_cow, start, end, desc->start, desc->end, pfn, + &desc->pgoff); + if (err) + return err; + vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS); + return 0; } -static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size) +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size) { - unsigned long end = addr + PAGE_ALIGN(size); + const unsigned long end = addr + PAGE_ALIGN(size); + const bool is_cow = is_cow_mapping(vma->vm_flags); int err; - err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end, - vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff); + err = get_remap_pgoff(is_cow, addr, end, vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); if (err) return err; @@ -3157,10 +3196,67 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); -int remap_pfn_range_complete(struct vm_area_struct *vma, 
unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +int remap_pfn_range_complete(struct vm_area_struct *vma, + struct mmap_action *action) { - return do_remap_pfn_range(vma, addr, pfn, size, prot); + const unsigned long start = action->remap.start; + const unsigned long pfn = action->remap.start_pfn; + const unsigned long size = action->remap.size; + const pgprot_t prot = action->remap.pgprot; + + return do_remap_pfn_range(vma, start, pfn, size, prot); +} + +static int __simple_ioremap_prep(unsigned long vm_len, pgoff_t vm_pgoff, + phys_addr_t start_phys, unsigned long size, + unsigned long *pfnp) +{ + unsigned long pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start_phys + size < start_phys) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + size += start_phys & ~PAGE_MASK; + pfn = start_phys >> PAGE_SHIFT; + pages = (size + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vm_pgoff > pages) + return -EINVAL; + pfn += vm_pgoff; + pages -= vm_pgoff; + + /* Can we fit all of the mapping? */ + if ((vm_len >> PAGE_SHIFT) > pages) + return -EINVAL; + + *pfnp = pfn; + return 0; +} + +int simple_ioremap_prepare(struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + const phys_addr_t start = action->simple_ioremap.start_phys_addr; + const unsigned long size = action->simple_ioremap.size; + unsigned long pfn; + int err; + + err = __simple_ioremap_prep(vma_desc_size(desc), desc->pgoff, + start, size, &pfn); + if (err) + return err; + + /* The I/O remap logic does the heavy lifting. 
*/ + mmap_action_ioremap_full(desc, pfn); + return io_remap_pfn_range_prepare(desc); } /** @@ -3180,32 +3276,15 @@ int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { - unsigned long vm_len, pfn, pages; - - /* Check that the physical memory area passed in looks valid */ - if (start + len < start) - return -EINVAL; - /* - * You *really* shouldn't map things that aren't page-aligned, - * but we've historically allowed it because IO memory might - * just have smaller alignment. - */ - len += start & ~PAGE_MASK; - pfn = start >> PAGE_SHIFT; - pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; - if (pfn + pages < pfn) - return -EINVAL; - - /* We start the mapping 'vm_pgoff' pages into the area */ - if (vma->vm_pgoff > pages) - return -EINVAL; - pfn += vma->vm_pgoff; - pages -= vma->vm_pgoff; + const unsigned long vm_start = vma->vm_start; + const unsigned long vm_end = vma->vm_end; + const unsigned long vm_len = vm_end - vm_start; + unsigned long pfn; + int err; - /* Can we fit all of the mapping? 
*/ - vm_len = vma->vm_end - vma->vm_start; - if (vm_len >> PAGE_SHIFT > pages) - return -EINVAL; + err = __simple_ioremap_prep(vm_len, vma->vm_pgoff, start, len, &pfn); + if (err) + return err; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); @@ -4241,31 +4320,25 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) return wp_page_copy(vmf); } -static void unmap_mapping_range_vma(struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr, - struct zap_details *details) -{ - zap_page_range_single(vma, start_addr, end_addr - start_addr, details); -} - static inline void unmap_mapping_range_tree(struct rb_root_cached *root, pgoff_t first_index, pgoff_t last_index, struct zap_details *details) { struct vm_area_struct *vma; - pgoff_t vba, vea, zba, zea; + unsigned long start, size; + struct mmu_gather tlb; vma_interval_tree_foreach(vma, root, first_index, last_index) { - vba = vma->vm_pgoff; - vea = vba + vma_pages(vma) - 1; - zba = max(first_index, vba); - zea = min(last_index, vea); + const pgoff_t start_idx = max(first_index, vma->vm_pgoff); + const pgoff_t end_idx = min(last_index, vma_last_pgoff(vma)) + 1; + + start = vma->vm_start + ((start_idx - vma->vm_pgoff) << PAGE_SHIFT); + size = (end_idx - start_idx) << PAGE_SHIFT; - unmap_mapping_range_vma(vma, - ((zba - vba) << PAGE_SHIFT) + vma->vm_start, - ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, - details); + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_vma_range_batched(&tlb, vma, start, size, details); + tlb_finish_mmu(&tlb); } } @@ -4292,7 +4365,7 @@ void unmap_mapping_folio(struct folio *folio) first_index = folio->index; last_index = folio_next_index(folio) - 1; - details.even_cows = false; + details.skip_cows = true; details.single_folio = folio; details.zap_flags = ZAP_FLAG_DROP_MARKER; @@ -4322,7 +4395,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t first_index = start; pgoff_t last_index = start + nr - 
1; - details.even_cows = even_cows; + details.skip_cows = !even_cows; if (last_index < first_index) last_index = ULONG_MAX; @@ -5209,6 +5282,37 @@ fallback: return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } +void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, + bool uffd_wp) +{ + const unsigned int nr_pages = folio_nr_pages(folio); + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); + + entry = pte_sw_mkyoung(entry); + + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry), vma); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + folio_ref_add(folio, nr_pages - 1); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_ptes(vma->vm_mm, addr, pte, entry, nr_pages); + update_mmu_cache_range(NULL, vma, addr, pte, nr_pages); +} + +static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, bool uffd_wp) +{ + const unsigned int order = folio_order(folio); + + map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1L << order); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC); +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -5220,7 +5324,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; - int nr_pages = 1; + int nr_pages; pte_t entry; /* File mapping without ->vm_ops ? 
*/ @@ -5237,7 +5341,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), + entry = pte_mkspecial(pfn_pte(zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); @@ -5255,7 +5359,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } - goto setpte; + if (vmf_orig_pte_uffd_wp(vmf)) + entry = pte_mkuffd_wp(entry); + set_pte_at(vma->vm_mm, addr, vmf->pte, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, vmf->pte); + goto unlock; } /* Allocate our own private page. */ @@ -5279,11 +5389,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = folio_mk_pte(folio, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry), vma); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; @@ -5305,19 +5410,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } - - folio_ref_add(folio, nr_pages - 1); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); - count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); - folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); -setpte: - if (vmf_orig_pte_uffd_wp(vmf)) - entry = pte_mkuffd_wp(entry); - set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); + map_anon_folio_pte_pf(folio, vmf->pte, vma, addr, + vmf_orig_pte_uffd_wp(vmf)); unlock: if (vmf->pte) 
pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -5426,7 +5520,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; - if (folio_order(folio) != HPAGE_PMD_ORDER) + if (!is_pmd_order(folio_order(folio))) return ret; page = &folio->page; @@ -6815,11 +6909,16 @@ retry: pudp = pud_offset(p4dp, address); pud = pudp_get(pudp); - if (pud_none(pud)) + if (!pud_present(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); - if (!unlikely(pud_leaf(pud))) { + pud = pudp_get(pudp); + + if (unlikely(!pud_present(pud))) { + spin_unlock(lock); + goto out; + } else if (unlikely(!pud_leaf(pud))) { spin_unlock(lock); goto retry; } @@ -6831,9 +6930,16 @@ retry: pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); + if (!pmd_present(pmd)) + goto out; if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); - if (!unlikely(pmd_leaf(pmd))) { + pmd = pmdp_get(pmdp); + + if (unlikely(!pmd_present(pmd))) { + spin_unlock(lock); + goto out; + } else if (unlikely(!pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bc805029da51..2a943ec57c85 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -221,7 +221,7 @@ void put_online_mems(void) bool movable_node_enabled = false; static int mhp_default_online_type = -1; -int mhp_get_default_online_type(void) +enum mmop mhp_get_default_online_type(void) { if (mhp_default_online_type >= 0) return mhp_default_online_type; @@ -240,7 +240,7 @@ int mhp_get_default_online_type(void) return mhp_default_online_type; } -void mhp_set_default_online_type(int online_type) +void mhp_set_default_online_type(enum mmop online_type) { mhp_default_online_type = online_type; } @@ -319,21 +319,13 @@ static void release_memory_resource(struct resource *res) static int check_pfn_span(unsigned long pfn, unsigned long nr_pages) { /* - * Disallow all operations smaller than a sub-section and only - * allow 
operations smaller than a section for - * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() - * enforces a larger memory_block_size_bytes() granularity for - * memory that will be marked online, so this check should only - * fire for direct arch_{add,remove}_memory() users outside of - * add_memory_resource(). + * Disallow all operations smaller than a sub-section. + * Note that check_hotplug_memory_range() enforces a larger + * memory_block_size_bytes() granularity for memory that will be marked + * online, so this check should only fire for direct + * arch_{add,remove}_memory() users outside of add_memory_resource(). */ - unsigned long min_align; - - if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) - min_align = PAGES_PER_SUBSECTION; - else - min_align = PAGES_PER_SECTION; - if (!IS_ALIGNED(pfn | nr_pages, min_align)) + if (!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION)) return -EINVAL; return 0; } @@ -1046,7 +1038,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; } -struct zone *zone_for_pfn_range(int online_type, int nid, +struct zone *zone_for_pfn_range(enum mmop online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages) { @@ -1209,6 +1201,13 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, if (node_arg.nid >= 0) node_set_state(nid, N_MEMORY); + /* + * Check whether we are adding normal memory to the node for the first + * time. 
+ */ + if (!node_state(nid, N_NORMAL_MEMORY) && zone_idx(zone) <= ZONE_NORMAL) + node_set_state(nid, N_NORMAL_MEMORY); + if (need_zonelists_rebuild) build_all_zonelists(NULL); @@ -1745,7 +1744,8 @@ static int scan_movable_pages(unsigned long start, unsigned long end, { unsigned long pfn; - for_each_valid_pfn(pfn, start, end) { + for (pfn = start; pfn < end; pfn++) { + unsigned long nr_pages; struct page *page; struct folio *folio; @@ -1762,9 +1762,9 @@ static int scan_movable_pages(unsigned long start, unsigned long end, if (PageOffline(page) && page_count(page)) return -EBUSY; - if (!PageHuge(page)) - continue; folio = page_folio(page); + if (!folio_test_hugetlb(folio)) + continue; /* * This test is racy as we hold no reference or lock. The * hugetlb page could have been free'ed and head is no longer @@ -1774,7 +1774,11 @@ static int scan_movable_pages(unsigned long start, unsigned long end, */ if (folio_test_hugetlb_migratable(folio)) goto found; - pfn |= folio_nr_pages(folio) - 1; + nr_pages = folio_nr_pages(folio); + if (unlikely(nr_pages < 1 || nr_pages > MAX_FOLIO_NR_PAGES || + !is_power_of_2(nr_pages))) + continue; + pfn |= nr_pages - 1; } return -ENOENT; found: @@ -1790,7 +1794,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - for_each_valid_pfn(pfn, start_pfn, end_pfn) { + for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page; page = pfn_to_page(pfn); @@ -1908,6 +1912,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, unsigned long flags; char *reason; int ret; + unsigned long normal_pages = 0; + enum zone_type zt; /* * {on,off}lining is constrained to full memory sections (or more @@ -2056,6 +2062,17 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, init_per_zone_wmark_min(); /* + * Check whether this operation removes the last normal memory from + * the node. 
We do this before clearing N_MEMORY to avoid the possible + * transient "!N_MEMORY && N_NORMAL_MEMORY" state. + */ + if (zone_idx(zone) <= ZONE_NORMAL) { + for (zt = 0; zt <= ZONE_NORMAL; zt++) + normal_pages += pgdat->node_zones[zt].present_pages; + if (!normal_pages) + node_clear_state(node, N_NORMAL_MEMORY); + } + /* * Make sure to mark the node as memory-less before rebuilding the zone * list. Otherwise this node would still appear in the fallback lists. */ @@ -2305,7 +2322,7 @@ EXPORT_SYMBOL_GPL(remove_memory); static int try_offline_memory_block(struct memory_block *mem, void *arg) { - uint8_t online_type = MMOP_ONLINE_KERNEL; + enum mmop online_type = MMOP_ONLINE_KERNEL; uint8_t **online_types = arg; struct page *page; int rc; @@ -2338,7 +2355,7 @@ static int try_reonline_memory_block(struct memory_block *mem, void *arg) int rc; if (**online_types != MMOP_OFFLINE) { - mem->online_type = **online_types; + mem->online_type = (enum mmop)**online_types; rc = device_online(&mem->dev); if (rc < 0) pr_warn("%s: Failed to re-online memory: %d", diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e5175f1c767..4e4421b22b59 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -487,7 +487,13 @@ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; - kmem_cache_free(policy_cache, pol); + /* + * Required to allow mmap_lock_speculative*() access, see for example + * futex_key_to_node_opt(). All accesses are serialized by mmap_lock, + * however the speculative lock section unbound by the normal lock + * boundaries, requiring RCU freeing. 
+ */ + kfree_rcu(pol, rcu); } EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); @@ -1020,7 +1026,7 @@ static int vma_replace_policy(struct vm_area_struct *vma, } old = vma->vm_policy; - vma->vm_policy = new; /* protected by mmap_lock */ + WRITE_ONCE(vma->vm_policy, new); /* protected by mmap_lock */ mpol_put(old); return 0; @@ -1239,7 +1245,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { @@ -2449,7 +2455,7 @@ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ - order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { + is_pmd_order(order) && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred @@ -3700,18 +3706,19 @@ static ssize_t weighted_interleave_auto_store(struct kobject *kobj, new_wi_state->iw_table[i] = 1; mutex_lock(&wi_state_lock); - if (!input) { - old_wi_state = rcu_dereference_protected(wi_state, - lockdep_is_held(&wi_state_lock)); - if (!old_wi_state) - goto update_wi_state; - if (input == old_wi_state->mode_auto) { - mutex_unlock(&wi_state_lock); - return count; - } + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); - memcpy(new_wi_state->iw_table, old_wi_state->iw_table, - nr_node_ids * sizeof(u8)); + if (old_wi_state && input == old_wi_state->mode_auto) { + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + return count; + } + + if (!input) { + if (old_wi_state) + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); goto update_wi_state; } @@ -3781,9 +3788,11 @@ static void wi_state_free(void) } } -static struct kobj_attribute wi_auto_attr = - __ATTR(auto, 0664, 
weighted_interleave_auto_show, - weighted_interleave_auto_store); +static struct kobj_attribute wi_auto_attr = { + .attr = { .name = "auto", .mode = 0664 }, + .show = weighted_interleave_auto_show, + .store = weighted_interleave_auto_store, +}; static void wi_cleanup(void) { sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); diff --git a/mm/memremap.c b/mm/memremap.c index ac7be07e3361..053842d45cb1 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -454,7 +454,7 @@ void free_zone_device_folio(struct folio *folio) if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; pgmap->ops->folio_free(folio); - percpu_ref_put_many(&folio->pgmap->ref, nr); + percpu_ref_put_many(&pgmap->ref, nr); break; case MEMORY_DEVICE_GENERIC: diff --git a/mm/migrate.c b/mm/migrate.c index 2c3d489ecf51..8a64291ab5b4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -321,7 +321,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, if (!pages_identical(page, ZERO_PAGE(0))) return false; - newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), + newpte = pte_mkspecial(pfn_pte(zero_pfn(pvmw->address), pvmw->vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) @@ -672,6 +672,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; + rcu_read_lock(); memcg = folio_memcg(folio); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); @@ -699,6 +700,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } + rcu_read_unlock(); } local_irq_enable(); @@ -1358,6 +1360,8 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, int rc; int old_page_state = 0; struct anon_vma *anon_vma = NULL; + bool src_deferred_split = false; + bool src_partially_mapped = false; 
struct list_head *prev; __migrate_folio_extract(dst, &old_page_state, &anon_vma); @@ -1371,11 +1375,26 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, goto out_unlock_both; } + if (folio_order(src) > 1 && + !data_race(list_empty(&src->_deferred_list))) { + src_deferred_split = true; + src_partially_mapped = folio_test_partially_mapped(src); + } + rc = move_to_new_folio(dst, src, mode); if (rc) goto out; /* + * Requeue the destination folio on the deferred split queue if + * the source was on the queue. The source is unqueued in + * __folio_migrate_mapping(), so we recorded the state from + * before move_to_new_folio(). + */ + if (src_deferred_split) + deferred_split_folio(dst, src_partially_mapped); + + /* * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will * automatically build up the correct dst->mlock_count for it. @@ -2205,8 +2224,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } -#ifdef CONFIG_NUMA - +#ifdef CONFIG_NUMA_MIGRATION static int store_status(int __user *status, int start, int value, int nr) { while (nr-- > 0) { @@ -2605,6 +2623,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, { return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } +#endif /* CONFIG_NUMA_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING /* @@ -2747,4 +2766,3 @@ int migrate_misplaced_folio(struct folio *folio, int node) return nr_remaining ? 
-EAGAIN : 0; } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_NUMA */ diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8079676c8f1f..fbfe5715f635 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -175,12 +175,6 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, return migrate_vma_collect_skip(start, end, walk); } - if (softleaf_is_migration(entry)) { - softleaf_entry_wait_on_locked(entry, ptl); - spin_unlock(ptl); - return -EAGAIN; - } - if (softleaf_is_device_private_write(entry)) write = MIGRATE_PFN_WRITE; } else { @@ -914,6 +908,10 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, unsigned long flags; int ret = 0; + /* + * take a reference, since split_huge_pmd_address() with freeze = true + * drops a reference at the end. + */ folio_get(folio); split_huge_pmd_address(migrate->vma, addr, true); ret = folio_split_unmapped(folio, 0); diff --git a/mm/mlock.c b/mm/mlock.c index 2f699c3497a5..8c227fefa2df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -13,7 +13,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/pagewalk.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> @@ -205,7 +205,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) } if (lruvec) - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); folios_put(fbatch); } @@ -415,13 +415,14 @@ out: * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma - * @newflags - the new set of flags for @vma. + * @new_vma_flags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. 
*/ static void mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t newflags) + unsigned long start, unsigned long end, + vma_flags_t *new_vma_flags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, @@ -439,18 +440,18 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. */ - if (newflags & VM_LOCKED) - newflags |= VM_IO; + if (vma_flags_test(new_vma_flags, VMA_LOCKED_BIT)) + vma_flags_set(new_vma_flags, VMA_IO_BIT); vma_start_write(vma); - vm_flags_reset_once(vma, newflags); + vma_flags_reset_once(vma, new_vma_flags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); - if (newflags & VM_IO) { - newflags &= ~VM_IO; - vm_flags_reset_once(vma, newflags); + if (vma_flags_test(new_vma_flags, VMA_IO_BIT)) { + vma_flags_clear(new_vma_flags, VMA_IO_BIT); + vma_flags_reset_once(vma, new_vma_flags); } } @@ -467,18 +468,22 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { + vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); + const vma_flags_t old_vma_flags = vma->flags; struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; - vm_flags_t oldflags = vma->vm_flags; - if (newflags == oldflags || (oldflags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || - vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) - /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags) || + vma_is_secretmem(vma) || !vma_supports_mlock(vma)) { + /* + * Don't set VM_LOCKED or VM_LOCKONFAULT and don't count. + * For secretmem, don't allow the memory to be unlocked. 
+ */ goto out; + } - vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags); + vma = vma_modify_flags(vmi, *prev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -488,9 +493,9 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; - if (!(newflags & VM_LOCKED)) + if (!vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT)) nr_pages = -nr_pages; - else if (oldflags & VM_LOCKED) + else if (vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) nr_pages = 0; mm->locked_vm += nr_pages; @@ -499,12 +504,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + if (vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT) && + vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); - vm_flags_reset(vma, newflags); + vma->flags = new_vma_flags; } else { - mlock_vma_pages_range(vma, start, end, newflags); + mlock_vma_pages_range(vma, start, end, &new_vma_flags); } out: *prev = vma; diff --git a/mm/mm_init.c b/mm/mm_init.c index df34797691bd..f9f8e1af921c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,6 +53,17 @@ EXPORT_SYMBOL(mem_map); void *high_memory; EXPORT_SYMBOL(high_memory); +unsigned long zero_page_pfn __ro_after_init; +EXPORT_SYMBOL(zero_page_pfn); + +#ifndef __HAVE_COLOR_ZERO_PAGE +uint8_t empty_zero_page[PAGE_SIZE] __page_aligned_bss; +EXPORT_SYMBOL(empty_zero_page); + +struct page *__zero_page __ro_after_init; +EXPORT_SYMBOL(__zero_page); +#endif /* __HAVE_COLOR_ZERO_PAGE */ + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -772,36 +783,11 @@ void __meminit init_deferred_page(unsigned long pfn, int nid) __init_deferred_page(pfn, nid); } 
-/* - * Initialised pages do not have PageReserved set. This function is - * called for each range allocated by the bootmem allocator and - * marks the pages PageReserved. The remaining valid pages are later - * sent to the buddy page allocator. - */ -void __meminit reserve_bootmem_region(phys_addr_t start, - phys_addr_t end, int nid) -{ - unsigned long pfn; - - for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { - struct page *page = pfn_to_page(pfn); - - __init_deferred_page(pfn, nid); - - /* - * no need for atomic set_bit because the struct - * page is not visible yet so nobody should - * access it yet. - */ - __SetPageReserved(page); - } -} - /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ static bool __meminit overlap_memmap_init(unsigned long zone, unsigned long *pfn) { - static struct memblock_region *r; + static struct memblock_region *r __meminitdata; if (mirrored_kernelcore && zone == ZONE_MOVABLE) { if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { @@ -1099,7 +1085,7 @@ static void __ref memmap_init_compound(struct page *head, struct page *page = pfn_to_page(pfn); __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); - prep_compound_tail(head, pfn - head_pfn); + prep_compound_tail(page, head, order); set_page_count(page, 0); } prep_compound_head(head, order); @@ -1885,7 +1871,7 @@ static void __init free_area_init(void) pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); - subsection_map_init(start_pfn, end_pfn - start_pfn); + sparse_init_subsection_map(start_pfn, end_pfn - start_pfn); } /* Initialise every node */ @@ -2672,6 +2658,22 @@ static void __init mem_init_print_info(void) ); } +#ifndef __HAVE_COLOR_ZERO_PAGE +/* + * architectures that __HAVE_COLOR_ZERO_PAGE must define this function + */ +void __init __weak arch_setup_zero_pages(void) +{ + __zero_page = virt_to_page(empty_zero_page); +} +#endif + +static void __init 
init_zero_page_pfn(void) +{ + arch_setup_zero_pages(); + zero_page_pfn = page_to_pfn(ZERO_PAGE(0)); +} + void __init __weak arch_mm_preinit(void) { } @@ -2694,6 +2696,7 @@ void __init mm_core_init_early(void) void __init mm_core_init(void) { arch_mm_preinit(); + init_zero_page_pfn(); /* Initializations relying on SMP setup */ BUILD_BUG_ON(MAX_ZONELISTS > 2); diff --git a/mm/mmap.c b/mm/mmap.c index 843160946aa5..5754d1c36462 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -192,7 +192,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, + EMPTY_VMA_FLAGS) < 0) goto out; mm->brk = brk; @@ -375,7 +376,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; /* @@ -1201,8 +1202,10 @@ out: return ret; } -int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) +int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) { + const vma_flags_t vma_flags = is_exec ? + mk_vma_flags(VMA_EXEC_BIT) : EMPTY_VMA_FLAGS; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; @@ -1217,10 +1220,6 @@ int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) if (!len) return 0; - /* Until we need other flags, refuse anything except VM_EXEC. 
*/ - if ((vm_flags & (~VM_EXEC)) != 0) - return -EINVAL; - if (mmap_write_lock_killable(mm)) return -EINTR; @@ -1233,7 +1232,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) goto munmap_failed; vma = vma_prev(&vmi); - ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); + ret = do_brk_flags(&vmi, vma, addr, len, vma_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -1246,7 +1245,6 @@ limits_failed: mmap_write_unlock(mm); return ret; } -EXPORT_SYMBOL(vm_brk_flags); static unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi, @@ -1332,12 +1330,13 @@ destroy: * Return true if the calling process may expand its vm space by the passed * number of pages */ -bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) +bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, + unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; - if (is_data_mapping(flags) && + if (is_data_mapping_vma_flags(vma_flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index fe5b6a031717..3985d856de7f 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -296,6 +296,25 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch) call_rcu(&batch->rcu, tlb_remove_table_rcu); } +/** + * tlb_remove_table_sync_rcu - synchronize with software page-table walkers + * + * Like tlb_remove_table_sync_one() but uses RCU grace period instead of IPI + * broadcast. Use in slow paths where sleeping is acceptable. + * + * Software/Lockless page-table walkers use local_irq_disable(), which is also + * an RCU read-side critical section. 
synchronize_rcu() waits for all such + * sections, providing the same guarantee as tlb_remove_table_sync_one() but + * without disrupting all CPUs with IPIs. + * + * Do not use for freeing memory. Use RCU callbacks instead to avoid latency + * spikes. + */ +void tlb_remove_table_sync_rcu(void) +{ + synchronize_rcu(); +} + #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) @@ -339,7 +358,7 @@ static inline void __tlb_remove_table_one(void *table) #else static inline void __tlb_remove_table_one(void *table) { - tlb_remove_table_sync_one(); + tlb_remove_table_sync_rcu(); __tlb_remove_table(table); } #endif /* CONFIG_PT_RECLAIM */ diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index a6cdf3674bdc..245b74f39f91 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -260,6 +260,15 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) } EXPORT_SYMBOL_GPL(mmu_interval_read_begin); +static void mn_itree_finish_pass(struct llist_head *finish_passes) +{ + struct llist_node *first = llist_reverse_order(__llist_del_all(finish_passes)); + struct mmu_interval_notifier_finish *f, *next; + + llist_for_each_entry_safe(f, next, first, link) + f->notifier->ops->invalidate_finish(f); +} + static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, struct mm_struct *mm) { @@ -271,6 +280,7 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, .end = ULONG_MAX, }; struct mmu_interval_notifier *interval_sub; + LLIST_HEAD(finish_passes); unsigned long cur_seq; bool ret; @@ -278,11 +288,27 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, mn_itree_inv_start_range(subscriptions, &range, &cur_seq); interval_sub; interval_sub = mn_itree_inv_next(interval_sub, &range)) { - ret = interval_sub->ops->invalidate(interval_sub, &range, - cur_seq); + if (interval_sub->ops->invalidate_start) { + struct mmu_interval_notifier_finish *finish = 
NULL; + + ret = interval_sub->ops->invalidate_start(interval_sub, + &range, + cur_seq, + &finish); + if (ret && finish) { + finish->notifier = interval_sub; + __llist_add(&finish->link, &finish_passes); + } + + } else { + ret = interval_sub->ops->invalidate(interval_sub, + &range, + cur_seq); + } WARN_ON(!ret); } + mn_itree_finish_pass(&finish_passes); mn_itree_inv_end(subscriptions); } @@ -309,7 +335,7 @@ static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions, * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + hlist_for_each_entry_srcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) /* * If ->release runs before mmu_notifier_unregister it must be @@ -364,15 +390,15 @@ void __mmu_notifier_release(struct mm_struct *mm) * unmap the address and return 1 or 0 depending if the mapping previously * existed or not. */ -int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +bool __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->clear_flush_young) @@ -384,15 +410,15 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } -int __mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +bool __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, 
srcu_read_lock_held(&srcu)) { if (subscription->ops->clear_young) @@ -404,14 +430,15 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, return young; } -int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +bool __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->test_young) { @@ -430,7 +457,9 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range) { struct mmu_interval_notifier *interval_sub; + LLIST_HEAD(finish_passes); unsigned long cur_seq; + int err = 0; for (interval_sub = mn_itree_inv_start_range(subscriptions, range, &cur_seq); @@ -438,23 +467,41 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, interval_sub = mn_itree_inv_next(interval_sub, range)) { bool ret; - ret = interval_sub->ops->invalidate(interval_sub, range, - cur_seq); + if (interval_sub->ops->invalidate_start) { + struct mmu_interval_notifier_finish *finish = NULL; + + ret = interval_sub->ops->invalidate_start(interval_sub, + range, + cur_seq, + &finish); + if (ret && finish) { + finish->notifier = interval_sub; + __llist_add(&finish->link, &finish_passes); + } + + } else { + ret = interval_sub->ops->invalidate(interval_sub, + range, + cur_seq); + } if (!ret) { if (WARN_ON(mmu_notifier_range_blockable(range))) continue; - goto out_would_block; + err = -EAGAIN; + break; } } - return 0; -out_would_block: + mn_itree_finish_pass(&finish_passes); + /* * On -EAGAIN the non-blocking caller is not allowed to call * invalidate_range_end() */ - mn_itree_inv_end(subscriptions); - return -EAGAIN; + if (err) + mn_itree_inv_end(subscriptions); + + return err; } static 
int mn_hlist_invalidate_range_start( @@ -466,7 +513,7 @@ static int mn_hlist_invalidate_range_start( int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + hlist_for_each_entry_srcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { const struct mmu_notifier_ops *ops = subscription->ops; @@ -504,7 +551,7 @@ static int mn_hlist_invalidate_range_start( * notifiers and one or more failed start, any that succeeded * start are expecting their end to be called. Do so now. */ - hlist_for_each_entry_rcu(subscription, &subscriptions->list, + hlist_for_each_entry_srcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (!subscription->ops->invalidate_range_end) continue; @@ -542,7 +589,7 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + hlist_for_each_entry_srcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->invalidate_range_end) { if (!mmu_notifier_range_blockable(range)) @@ -577,7 +624,7 @@ void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->arch_invalidate_secondary_tlbs) @@ -713,7 +760,7 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) struct mmu_notifier *subscription; spin_lock(&mm->notifier_subscriptions->lock); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, lockdep_is_held(&mm->notifier_subscriptions->lock)) { if (subscription->ops != ops) @@ -976,6 +1023,7 @@ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct 
mmu_notifier_subscriptions *subscriptions; int ret; + WARN_ON_ONCE(ops->invalidate_start && !ops->invalidate_finish); might_lock(&mm->mmap_lock); subscriptions = smp_load_acquire(&mm->notifier_subscriptions); diff --git a/mm/mprotect.c b/mm/mprotect.c index c0571445bef7..9cbf932b028c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, } /* Set nr_ptes number of ptes, starting from idx */ -static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, - int idx, bool set_write, struct mmu_gather *tlb) +static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, + int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb) { /* * Advance the position in the batch by idx; note that if idx > 0, @@ -143,7 +143,7 @@ static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long add * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce * that the ptes point to consecutive pages of the same anon large folio. */ -static int page_anon_exclusive_sub_batch(int start_idx, int max_len, +static __always_inline int page_anon_exclusive_sub_batch(int start_idx, int max_len, struct page *first_page, bool expected_anon_exclusive) { int idx; @@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len, * pte of the batch. Therefore, we must individually check all pages and * retrieve sub-batches. 
*/ -static void commit_anon_folio_batch(struct vm_area_struct *vma, +static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma, struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) { @@ -188,7 +188,7 @@ static void commit_anon_folio_batch(struct vm_area_struct *vma, } } -static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, +static __always_inline void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) { @@ -211,6 +211,111 @@ static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb); } +static long change_softleaf_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, pte_t oldpte, unsigned long cp_flags) +{ + const bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + softleaf_t entry = softleaf_from_pte(oldpte); + pte_t newpte; + + if (softleaf_is_migration_write(entry)) { + const struct folio *folio = softleaf_to_folio(entry); + + /* + * A protection check is difficult so + * just be safe and disable write + */ + if (folio_test_anon(folio)) + entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + } else if (softleaf_is_device_private_write(entry)) { + /* + * We do not preserve soft-dirtiness. See + * copy_nonpresent_pte() for explanation. 
+ */ + entry = make_readable_device_private_entry(swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (softleaf_is_marker(entry)) { + /* + * Ignore error swap entries unconditionally, + * because any access should sigbus/sigsegv + * anyway. + */ + if (softleaf_is_poison_marker(entry) || + softleaf_is_guard_marker(entry)) + return 0; + /* + * If this is uffd-wp pte marker and we'd like + * to unprotect it, drop it; the next page + * fault will trigger without uffd trapping. + */ + if (uffd_wp_resolve) { + pte_clear(vma->vm_mm, addr, pte); + return 1; + } + return 0; + } else { + newpte = oldpte; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); + return 1; + } + return 0; +} + +static __always_inline void change_present_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + int nr_ptes, unsigned long end, pgprot_t newprot, + struct folio *folio, struct page *page, unsigned long cp_flags) +{ + const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + const bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + pte_t ptent, oldpte; + + oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes); + ptent = pte_modify(oldpte, newprot); + + if (uffd_wp) + ptent = pte_mkuffd_wp(ptent); + else if (uffd_wp_resolve) + ptent = pte_clear_uffd_wp(ptent); + + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. + * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. 
+ */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent)) + set_write_prot_commit_flush_ptes(vma, folio, page, + addr, ptep, oldpte, ptent, nr_ptes, tlb); + else + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, + nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); +} + static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -221,7 +326,6 @@ static long change_pte_range(struct mmu_gather *tlb, bool is_private_single_threaded; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; - bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; int nr_ptes; tlb_change_page_size(tlb, PAGE_SIZE); @@ -242,7 +346,6 @@ static long change_pte_range(struct mmu_gather *tlb, int max_nr_ptes = (end - addr) >> PAGE_SHIFT; struct folio *folio = NULL; struct page *page; - pte_t ptent; /* Already in the desired state. */ if (prot_numa && pte_protnone(oldpte)) @@ -268,34 +371,20 @@ static long change_pte_range(struct mmu_gather *tlb, nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags); - oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes); - ptent = pte_modify(oldpte, newprot); - - if (uffd_wp) - ptent = pte_mkuffd_wp(ptent); - else if (uffd_wp_resolve) - ptent = pte_clear_uffd_wp(ptent); - /* - * In some writable, shared mappings, we might want - * to catch actual write access -- see - * vma_wants_writenotify(). - * - * In all writable, private mappings, we have to - * properly handle COW. - * - * In both cases, we can sometimes still change PTEs - * writable and avoid the write-fault handler, for - * example, if a PTE is already dirty and no other - * COW or special handling is required. + * Optimize for the small-folio common case by + * special-casing it here. Compiler constant propagation + * plus copious amounts of __always_inline does wonders. 
*/ - if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && - !pte_write(ptent)) - set_write_prot_commit_flush_ptes(vma, folio, page, - addr, pte, oldpte, ptent, nr_ptes, tlb); - else - prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, - nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); + if (likely(nr_ptes == 1)) { + change_present_ptes(tlb, vma, addr, pte, 1, + end, newprot, folio, page, cp_flags); + } else { + change_present_ptes(tlb, vma, addr, pte, + nr_ptes, end, newprot, folio, page, + cp_flags); + } + pages += nr_ptes; } else if (pte_none(oldpte)) { /* @@ -317,66 +406,7 @@ static long change_pte_range(struct mmu_gather *tlb, pages++; } } else { - softleaf_t entry = softleaf_from_pte(oldpte); - pte_t newpte; - - if (softleaf_is_migration_write(entry)) { - const struct folio *folio = softleaf_to_folio(entry); - - /* - * A protection check is difficult so - * just be safe and disable write - */ - if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry( - swp_offset(entry)); - else - entry = make_readable_migration_entry(swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(oldpte)) - newpte = pte_swp_mksoft_dirty(newpte); - } else if (softleaf_is_device_private_write(entry)) { - /* - * We do not preserve soft-dirtiness. See - * copy_nonpresent_pte() for explanation. - */ - entry = make_readable_device_private_entry( - swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); - } else if (softleaf_is_marker(entry)) { - /* - * Ignore error swap entries unconditionally, - * because any access should sigbus/sigsegv - * anyway. - */ - if (softleaf_is_poison_marker(entry) || - softleaf_is_guard_marker(entry)) - continue; - /* - * If this is uffd-wp pte marker and we'd like - * to unprotect it, drop it; the next page - * fault will trigger without uffd trapping. 
- */ - if (uffd_wp_resolve) { - pte_clear(vma->vm_mm, addr, pte); - pages++; - } - continue; - } else { - newpte = oldpte; - } - - if (uffd_wp) - newpte = pte_swp_mkuffd_wp(newpte); - else if (uffd_wp_resolve) - newpte = pte_swp_clear_uffd_wp(newpte); - - if (!pte_same(oldpte, newpte)) { - set_pte_at(vma->vm_mm, addr, pte, newpte); - pages++; - } + pages += change_softleaf_pte(vma, addr, pte, oldpte, cp_flags); } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); lazy_mmu_mode_disable(); @@ -697,7 +727,8 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - vm_flags_t oldflags = READ_ONCE(vma->vm_flags); + const vma_flags_t old_vma_flags = READ_ONCE(vma->flags); + vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ -706,7 +737,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, if (vma_is_sealed(vma)) return -EPERM; - if (newflags == oldflags) { + if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags)) { *pprev = vma; return 0; } @@ -717,8 +748,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * uncommon case, so doesn't need to be very optimized. */ if (arch_has_pfn_modify_check() && - (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) && - (newflags & VM_ACCESS_FLAGS) == 0) { + vma_flags_test_any(&old_vma_flags, VMA_PFNMAP_BIT, + VMA_MIXEDMAP_BIT) && + !vma_flags_test_any_mask(&new_vma_flags, VMA_ACCESS_FLAGS)) { pgprot_t new_pgprot = vm_get_page_prot(newflags); error = walk_page_range(current->mm, start, end, @@ -736,24 +768,25 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * hugetlb mapping were accounted for even if read-only so there is * no need to account for them here. 
*/ - if (newflags & VM_WRITE) { + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) { /* Check space limits when area turns into data. */ - if (!may_expand_vm(mm, newflags, nrpages) && - may_expand_vm(mm, oldflags, nrpages)) + if (!may_expand_vm(mm, &new_vma_flags, nrpages) && + may_expand_vm(mm, &old_vma_flags, nrpages)) return -ENOMEM; - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| - VM_SHARED|VM_NORESERVE))) { + if (!vma_flags_test_any(&old_vma_flags, + VMA_ACCOUNT_BIT, VMA_WRITE_BIT, VMA_HUGETLB_BIT, + VMA_SHARED_BIT, VMA_NORESERVE_BIT)) { charged = nrpages; if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; - newflags |= VM_ACCOUNT; + vma_flags_set(&new_vma_flags, VMA_ACCOUNT_BIT); } - } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) && - !vma->anon_vma) { - newflags &= ~VM_ACCOUNT; + } else if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) && + vma_is_anonymous(vma) && !vma->anon_vma) { + vma_flags_clear(&new_vma_flags, VMA_ACCOUNT_BIT); } - vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); + vma = vma_modify_flags(vmi, *pprev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; @@ -766,26 +799,28 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * held in write mode. */ vma_start_write(vma); - vm_flags_reset_once(vma, newflags); + vma_flags_reset_once(vma, &new_vma_flags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); change_protection(tlb, vma, start, end, mm_cp_flags); - if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT)) + if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) && + !vma_flags_test(&new_vma_flags, VMA_ACCOUNT_BIT)) vm_unacct_memory(nrpages); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major * fault on access. 
*/ - if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && - (newflags & VM_WRITE)) { + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT) && + vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT) && + !vma_flags_test_any(&old_vma_flags, VMA_WRITE_BIT, VMA_SHARED_BIT)) populate_vma_page_range(vma, start, end, NULL); - } - vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages); + newflags = vma_flags_to_legacy(new_vma_flags); vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; @@ -873,6 +908,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { vm_flags_t mask_off_old_flags; + vma_flags_t new_vma_flags; vm_flags_t newflags; int new_vma_pkey; @@ -895,6 +931,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); newflags |= (vma->vm_flags & ~mask_off_old_flags); + new_vma_flags = legacy_to_vma_flags(newflags); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { @@ -902,7 +939,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - if (map_deny_write_exec(vma->vm_flags, newflags)) { + if (map_deny_write_exec(&vma->flags, &new_vma_flags)) { error = -EACCES; break; } @@ -978,7 +1015,7 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) if (pkey == -1) goto out; - ret = arch_set_user_pkey_access(current, pkey, init_val); + ret = arch_set_user_pkey_access(pkey, init_val); if (ret) { mm_pkey_free(current->mm, pkey); goto out; diff --git a/mm/mremap.c b/mm/mremap.c index 2be876a70cc0..e9c8b1d05832 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -244,7 +244,7 @@ static int move_ptes(struct pagetable_move_control *pmc, goto out; } /* - * Now new_pte is none, so hpage_collapse_scan_file() path can not find + * Now 
new_pte is none, so collapse_scan_file() path can not find * this by traversing file->f_mapping, so there is no concurrency with * retract_page_tables(). In addition, we already hold the exclusive * mmap_lock, so this new_pte page is stable, so there is no need to get @@ -1028,6 +1028,75 @@ static void vrm_stat_account(struct vma_remap_struct *vrm, mm->locked_vm += pages; } +static bool __check_map_count_against_split(struct mm_struct *mm, + bool before_unmaps) +{ + const int sys_map_count = get_sysctl_max_map_count(); + int map_count = mm->map_count; + + mmap_assert_write_locked(mm); + + /* + * At the point of shrinking the VMA, if new_len < old_len, we unmap + * thusly in the worst case: + * + * old_addr+old_len old_addr+old_len + * |---------------.----.---------| |---------------| |---------| + * | . . | -> | +1 | -1 | +1 | + * |---------------.----.---------| |---------------| |---------| + * old_addr+new_len old_addr+new_len + * + * At the point of removing the portion of an existing VMA to make space + * for the moved VMA if MREMAP_FIXED, we unmap thusly in the worst case: + * + * new_addr new_addr+new_len new_addr new_addr+new_len + * |----.---------------.---------| |----| |---------| + * | . . | -> | +1 | -1 | +1 | + * |----.---------------.---------| |----| |---------| + * + * Therefore, before we consider the move anything, we have to account + * for 2 additional VMAs possibly being created upon these unmappings. + */ + if (before_unmaps) + map_count += 2; + + /* + * At the point of MOVING the VMA: + * + * We start by copying a VMA, which creates an additional VMA if no + * merge occurs, then if not MREMAP_DONTUNMAP, we unmap the source VMA. 
+ * In the worst case we might then observe: + * + * new_addr new_addr+new_len new_addr new_addr+new_len + * |----| |---------| |----|---------------|---------| + * | | | | -> | | +1 | | + * |----| |---------| |----|---------------|---------| + * + * old_addr old_addr+old_len old_addr old_addr+old_len + * |----.---------------.---------| |----| |---------| + * | . . | -> | +1 | -1 | +1 | + * |----.---------------.---------| |----| |---------| + * + * Therefore we must check to ensure we have headroom of 2 additional + * VMAs. + */ + return map_count + 2 <= sys_map_count; +} + +/* Do we violate the map count limit if we split VMAs when moving the VMA? */ +static bool check_map_count_against_split(void) +{ + return __check_map_count_against_split(current->mm, + /*before_unmaps=*/false); +} + +/* Do we violate the map count limit if we split VMAs prior to early unmaps? */ +static bool check_map_count_against_split_early(void) +{ + return __check_map_count_against_split(current->mm, + /*before_unmaps=*/true); +} + /* * Perform checks before attempting to write a VMA prior to it being * moved. @@ -1041,10 +1110,11 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) vm_flags_t dummy = vma->vm_flags; /* - * We'd prefer to avoid failure later on in do_munmap: - * which may split one vma into three before unmapping. + * We'd prefer to avoid failure later on in do_munmap: we copy a VMA, + * which may not merge, then (if MREMAP_DONTUNMAP is not set) unmap the + * source, which may split, causing a net increase of 2 mappings. 
*/ - if (current->mm->map_count >= sysctl_max_map_count - 3) + if (!check_map_count_against_split()) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { @@ -1402,10 +1472,10 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (vrm->flags & MREMAP_DONTUNMAP) { - vm_flags_t vm_flags = vrm->vma->vm_flags; + vma_flags_t vma_flags = vrm->vma->flags; unsigned long pages = vrm->old_len >> PAGE_SHIFT; - if (!may_expand_vm(mm, vm_flags, pages)) + if (!may_expand_vm(mm, &vma_flags, pages)) return -ENOMEM; } @@ -1743,7 +1813,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm) if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta)) return -EAGAIN; - if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) + if (!may_expand_vm(mm, &vma->flags, vrm->delta >> PAGE_SHIFT)) return -ENOMEM; return 0; @@ -1803,23 +1873,6 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) if (vrm_overlaps(vrm)) return -EINVAL; - /* - * move_vma() need us to stay 4 maps below the threshold, otherwise - * it will bail out at the very beginning. - * That is a problem if we have already unmapped the regions here - * (new_addr, and old_addr), because userspace will not know the - * state of the vma's after it gets -ENOMEM. - * So, to avoid such scenario we can pre-compute if the whole - * operation has high chances to success map-wise. - * Worst-scenario case is when both vma's (new_addr and old_addr) get - * split in 3 before unmapping it. - * That means 2 more maps (1 for each) to the ones we already hold. - * Check whether current map count plus 2 still leads us to 4 maps below - * the threshold, otherwise return -ENOMEM here to be more safe. 
- */ - if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) - return -ENOMEM; - return 0; } @@ -1929,6 +1982,11 @@ static unsigned long do_mremap(struct vma_remap_struct *vrm) return -EINTR; vrm->mmap_locked = true; + if (!check_map_count_against_split_early()) { + mmap_write_unlock(mm); + return -ENOMEM; + } + if (vrm_move_only(vrm)) { res = remap_move(vrm); } else { diff --git a/mm/mseal.c b/mm/mseal.c index 316b5e1dec78..e2093ae3d25c 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -56,7 +56,6 @@ static int mseal_apply(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *vma, *prev; - unsigned long curr_start = start; VMA_ITERATOR(vmi, mm, start); /* We know there are no gaps so this will be non-NULL. */ @@ -66,20 +65,23 @@ static int mseal_apply(struct mm_struct *mm, prev = vma; for_each_vma_range(vmi, vma, end) { + const unsigned long curr_start = MAX(vma->vm_start, start); const unsigned long curr_end = MIN(vma->vm_end, end); - if (!(vma->vm_flags & VM_SEALED)) { - vm_flags_t vm_flags = vma->vm_flags | VM_SEALED; + if (!vma_test(vma, VMA_SEALED_BIT)) { + vma_flags_t vma_flags = vma->flags; + + vma_flags_set(&vma_flags, VMA_SEALED_BIT); vma = vma_modify_flags(&vmi, prev, vma, curr_start, - curr_end, &vm_flags); + curr_end, &vma_flags); if (IS_ERR(vma)) return PTR_ERR(vma); - vm_flags_set(vma, VM_SEALED); + vma_start_write(vma); + vma_set_flags(vma, VMA_SEALED_BIT); } prev = vma; - curr_start = curr_end; } return 0; diff --git a/mm/nommu.c b/mm/nommu.c index c3a23b082adb..ed3934bc2de4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1317,7 +1317,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; mm = vma->vm_mm; - if (mm->map_count >= sysctl_max_map_count) + if (mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5c6c95c169ee..5f372f6e26fa 100644 --- a/mm/oom_kill.c +++ 
b/mm/oom_kill.c @@ -135,19 +135,16 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) { struct task_struct *t; - rcu_read_lock(); + guard(rcu)(); for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) - goto found; + return t; task_unlock(t); } - t = NULL; -found: - rcu_read_unlock(); - return t; + return NULL; } /* @@ -548,21 +545,8 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) * count elevated without a good reason. */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - struct mmu_notifier_range range; - struct mmu_gather tlb; - - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - mm, vma->vm_start, - vma->vm_end); - tlb_gather_mmu(&tlb, mm); - if (mmu_notifier_invalidate_range_start_nonblock(&range)) { - tlb_finish_mmu(&tlb); + if (zap_vma_for_reaping(vma)) ret = false; - continue; - } - unmap_page_range(&tlb, vma, range.start, range.end, NULL); - mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 601a5e048d12..88cd53d4ba09 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -33,7 +33,7 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/timer.h> #include <linux/sched/rt.h> #include <linux/sched/signal.h> @@ -1858,6 +1858,27 @@ free_running: break; } + /* + * Unconditionally start background writeback if it's not + * already in progress. We need to do this because the global + * dirty threshold check above (nr_dirty > gdtc->bg_thresh) + * doesn't account for these cases: + * + * a) strictlimit BDIs: throttling is calculated using per-wb + * thresholds. 
The per-wb threshold can be exceeded even when + * nr_dirty < gdtc->bg_thresh + * + * b) memcg-based throttling: memcg uses its own dirty count and + * thresholds and can trigger throttling even when global + * nr_dirty < gdtc->bg_thresh + * + * Writeback needs to be started else the writer stalls in the + * throttle loop waiting for dirty pages to be written back + * while no writeback is running. + */ + if (unlikely(!writeback_in_progress(wb))) + wb_start_background_writeback(wb); + mem_cgroup_flush_foreign(wb); /* @@ -2645,7 +2666,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * while this function is in progress, although it may have been truncated * before this function is called. Most callers have the folio locked. * A few have the folio blocked from truncation through other means (e.g. - * zap_vma_pages() has it mapped and is holding the page table lock). + * zap_vma() has it mapped and is holding the page table lock). * When called from mark_buffer_dirty(), the filesystem should hold a * reference to the buffer_head that is being marked dirty, which causes * try_to_free_buffers() to fail. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2d4b6f1a554e..65e205111553 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -31,7 +31,7 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/cpuset.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/memory_hotplug.h> #include <linux/nodemask.h> #include <linux/vmstat.h> @@ -94,23 +94,6 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) -/* - * On SMP, spin_trylock is sufficient protection. - * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. - * Pass flags to a no-op inline function to typecheck and silence the unused - * variable warning. 
- */ -static inline void __pcp_trylock_noop(unsigned long *flags) { } -#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags)) -#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags)) -#else - -/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ -#define pcp_trylock_prepare(flags) local_irq_save(flags) -#define pcp_trylock_finish(flags) local_irq_restore(flags) -#endif - /* * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid * a migration causing the wrong PCP to be locked and remote memory being @@ -128,71 +111,52 @@ static inline void __pcp_trylock_noop(unsigned long *flags) { } #endif /* - * Generic helper to lookup and a per-cpu variable with an embedded spinlock. - * Return value should be used with equivalent unlock helper. + * A helper to lookup and trylock pcp with embedded spinlock. + * The return value should be used with the unlock helper. + * NULL return value means the trylock failed. */ -#define pcpu_spin_trylock(type, member, ptr) \ +#ifdef CONFIG_SMP +#define pcp_spin_trylock(ptr) \ ({ \ - type *_ret; \ + struct per_cpu_pages *_ret; \ pcpu_task_pin(); \ _ret = this_cpu_ptr(ptr); \ - if (!spin_trylock(&_ret->member)) { \ + if (!spin_trylock(&_ret->lock)) { \ pcpu_task_unpin(); \ _ret = NULL; \ } \ _ret; \ }) -#define pcpu_spin_unlock(member, ptr) \ +#define pcp_spin_unlock(ptr) \ ({ \ - spin_unlock(&ptr->member); \ + spin_unlock(&ptr->lock); \ pcpu_task_unpin(); \ }) -/* struct per_cpu_pages specific helpers. */ -#define pcp_spin_trylock(ptr, UP_flags) \ -({ \ - struct per_cpu_pages *__ret; \ - pcp_trylock_prepare(UP_flags); \ - __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \ - if (!__ret) \ - pcp_trylock_finish(UP_flags); \ - __ret; \ -}) - -#define pcp_spin_unlock(ptr, UP_flags) \ -({ \ - pcpu_spin_unlock(lock, ptr); \ - pcp_trylock_finish(UP_flags); \ -}) - /* - * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e. 
- * a potentially remote cpu drain) and get interrupted by an operation that - * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP - * spinlock assumptions making the trylock a no-op. So we have to turn that - * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no - * remote cpu's so we can only be locking the only existing local one. + * On CONFIG_SMP=n the UP implementation of spin_trylock() never fails and thus + * is not compatible with our locking scheme. However we do not need pcp for + * scalability in the first place, so just make all the trylocks fail and take + * the slow path unconditionally. */ -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) -static inline void __flags_noop(unsigned long *flags) { } -#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ -({ \ - __flags_noop(&(flags)); \ - spin_lock(&(ptr)->lock); \ -}) -#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ -({ \ - spin_unlock(&(ptr)->lock); \ - __flags_noop(&(flags)); \ -}) #else -#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ - spin_lock_irqsave(&(ptr)->lock, flags) -#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ - spin_unlock_irqrestore(&(ptr)->lock, flags) +#define pcp_spin_trylock(ptr) \ + NULL + +#define pcp_spin_unlock(ptr) \ + BUG_ON(1) #endif +/* + * In some cases we do not need to pin the task to the CPU because we are + * already given a specific cpu's pcp pointer. 
+ */ +#define pcp_spin_lock_nopin(ptr) \ + spin_lock(&(ptr)->lock) +#define pcp_spin_unlock_nopin(ptr) \ + spin_unlock(&(ptr)->lock) + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -243,6 +207,8 @@ unsigned int pageblock_order __read_mostly; static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags); +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -331,11 +297,6 @@ int page_group_by_mobility_disabled __read_mostly; */ DEFINE_STATIC_KEY_TRUE(deferred_pages); -static inline bool deferred_pages_enabled(void) -{ - return static_branch_unlikely(&deferred_pages); -} - /* * deferred_grow_zone() is __init, but it is called from * get_page_from_freelist() during early boot until deferred_pages permanently @@ -348,11 +309,6 @@ _deferred_grow_zone(struct zone *zone, unsigned int order) return deferred_grow_zone(zone, order); } #else -static inline bool deferred_pages_enabled(void) -{ - return false; -} - static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order) { return false; @@ -687,7 +643,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool movable; if (order > PAGE_ALLOC_COSTLY_ORDER) { - VM_BUG_ON(order != HPAGE_PMD_ORDER); + VM_BUG_ON(!is_pmd_order(order)); movable = migratetype == MIGRATE_MOVABLE; @@ -719,7 +675,7 @@ static inline bool pcp_allowed_order(unsigned int order) if (order <= PAGE_ALLOC_COSTLY_ORDER) return true; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order == HPAGE_PMD_ORDER) + if (is_pmd_order(order)) return true; #endif return false; @@ -731,7 +687,7 @@ static inline bool pcp_allowed_order(unsigned int order) * The first PAGE_SIZE page is called the "head page" and have PG_head set. * * The remaining PAGE_SIZE pages are called "tail pages". 
PageTail() is encoded - * in bit 0 of page->compound_head. The rest of bits is pointer to head page. + * in bit 0 of page->compound_info. The rest of bits is pointer to head page. * * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. @@ -744,7 +700,7 @@ void prep_compound_page(struct page *page, unsigned int order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) - prep_compound_tail(page, i); + prep_compound_tail(page + i, page, order); prep_compound_head(page, order); } @@ -1079,7 +1035,6 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif - page_pool_page_is_pp(page) | (page->flags.f & check_flags))) return false; @@ -1106,8 +1061,6 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(page_pool_page_is_pp(page))) - bad_reason = "page_pool leak"; return bad_reason; } @@ -1289,10 +1242,18 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task, union pgtag_ref_handle handle; union codetag_ref ref; - if (get_page_tag_ref(page, &ref, &handle)) { + if (likely(get_page_tag_ref(page, &ref, &handle))) { alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); update_page_tag_ref(handle, &ref); put_page_tag_ref(handle); + } else { + /* + * page_ext is not available yet, record the pfn so we can + * clear the tag ref later when page_ext is initialized. 
+ */ + alloc_tag_add_early_pfn(page_to_pfn(page)); + if (task->alloc_tag) + alloc_tag_set_inaccurate(task->alloc_tag); } } @@ -1416,9 +1377,17 @@ __always_inline bool __free_pages_prepare(struct page *page, mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); folio->mapping = NULL; } - if (unlikely(page_has_type(page))) + if (unlikely(page_has_type(page))) { + /* networking expects to clear its page type before releasing */ + if (is_check_pages_enabled()) { + if (unlikely(PageNetpp(page))) { + bad_page(page, "page_pool leak"); + return false; + } + } /* Reset the page_type (which overlays _mapcount) */ page->page_type = UINT_MAX; + } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) @@ -2588,7 +2557,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { int high_min, to_drain, to_drain_batched, batch; - unsigned long UP_flags; bool todo = false; high_min = READ_ONCE(pcp->high_min); @@ -2608,9 +2576,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->count - pcp->high; while (to_drain > 0) { to_drain_batched = min(to_drain, batch); - pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + pcp_spin_lock_nopin(pcp); free_pcppages_bulk(zone, to_drain_batched, pcp, 0); - pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + pcp_spin_unlock_nopin(pcp); todo = true; to_drain -= to_drain_batched; @@ -2627,15 +2595,14 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long UP_flags; int to_drain, batch; batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { - pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + pcp_spin_lock_nopin(pcp); free_pcppages_bulk(zone, to_drain, pcp, 0); - pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + pcp_spin_unlock_nopin(pcp); } } #endif @@ -2646,11 +2613,10 @@ void drain_zone_pages(struct zone *zone, struct 
per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - unsigned long UP_flags; int count; do { - pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + pcp_spin_lock_nopin(pcp); count = pcp->count; if (count) { int to_drain = min(count, @@ -2659,7 +2625,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) free_pcppages_bulk(zone, to_drain, pcp, 0); count -= to_drain; } - pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + pcp_spin_unlock_nopin(pcp); } while (count); } @@ -2858,7 +2824,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, */ static bool free_frozen_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, - unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags) + unsigned int order, fpi_t fpi_flags) { int high, batch; int to_free, to_free_batched; @@ -2918,9 +2884,9 @@ static bool free_frozen_page_commit(struct zone *zone, if (to_free == 0 || pcp->count == 0) break; - pcp_spin_unlock(pcp, *UP_flags); + pcp_spin_unlock(pcp); - pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) { ret = false; break; @@ -2932,7 +2898,7 @@ static bool free_frozen_page_commit(struct zone *zone, * returned in an unlocked state. 
*/ if (smp_processor_id() != cpu) { - pcp_spin_unlock(pcp, *UP_flags); + pcp_spin_unlock(pcp); ret = false; break; } @@ -2964,7 +2930,6 @@ static bool free_frozen_page_commit(struct zone *zone, static void __free_frozen_pages(struct page *page, unsigned int order, fpi_t fpi_flags) { - unsigned long UP_flags; struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); @@ -3000,12 +2965,12 @@ static void __free_frozen_pages(struct page *page, unsigned int order, add_page_to_zone_llist(zone, page, order); return; } - pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { if (!free_frozen_page_commit(zone, pcp, page, migratetype, - order, fpi_flags, &UP_flags)) + order, fpi_flags)) return; - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, fpi_flags); } @@ -3026,7 +2991,6 @@ void free_frozen_pages_nolock(struct page *page, unsigned int order) */ void free_unref_folios(struct folio_batch *folios) { - unsigned long UP_flags; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; int i, j; @@ -3069,7 +3033,7 @@ void free_unref_folios(struct folio_batch *folios) if (zone != locked_zone || is_migrate_isolate(migratetype)) { if (pcp) { - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); locked_zone = NULL; pcp = NULL; } @@ -3088,7 +3052,7 @@ void free_unref_folios(struct folio_batch *folios) * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. 
*/ - pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { free_one_page(zone, &folio->page, pfn, order, FPI_NONE); @@ -3106,14 +3070,14 @@ void free_unref_folios(struct folio_batch *folios) trace_mm_page_free_batched(&folio->page); if (!free_frozen_page_commit(zone, pcp, &folio->page, - migratetype, order, FPI_NONE, &UP_flags)) { + migratetype, order, FPI_NONE)) { pcp = NULL; locked_zone = NULL; } } if (pcp) - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); folio_batch_reinit(folios); } @@ -3275,6 +3239,13 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, spin_unlock_irqrestore(&zone->lock, flags); } while (check_new_pages(page, order)); + /* + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) + reserve_highatomic_pageblock(page, order, zone); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -3346,6 +3317,20 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, int batch = nr_pcp_alloc(pcp, zone, order); int alloced; + /* + * Don't refill the list for a higher order atomic + * allocation under memory pressure, as this would + * not build up any HIGHATOMIC reserves, which + * might be needed soon. + * + * Instead, direct it towards the reserves by + * returning NULL, which will make the caller fall + * back to rmqueue_buddy. This will try to use the + * reserves first and grow them if needed. 
+ */ + if (alloc_flags & ALLOC_HIGHATOMIC) + return NULL; + alloced = rmqueue_bulk(zone, order, batch, list, migratetype, alloc_flags); @@ -3371,10 +3356,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct per_cpu_pages *pcp; struct list_head *list; struct page *page; - unsigned long UP_flags; /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ - pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) return NULL; @@ -3386,7 +3370,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp->free_count >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -3961,13 +3945,6 @@ try_this_zone: if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); - /* - * If this is a high-order atomic allocation then check - * if the pageblock should be reserved for the future - */ - if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) - reserve_highatomic_pageblock(page, order, zone); - return page; } else { if (cond_accept_memory(zone, order, alloc_flags)) @@ -5067,7 +5044,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array) { struct page *page; - unsigned long UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5136,7 +5112,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, cond_accept_memory(zone, 0, alloc_flags); retry_this_zone: - mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages - nr_populated; if (zone_watermark_fast(zone, 0, mark, zonelist_zone_idx(ac.preferred_zoneref), alloc_flags, gfp)) { @@ -5161,7 +5137,7 @@ 
retry_this_zone: goto failed; /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ - pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) goto failed; @@ -5180,7 +5156,7 @@ retry_this_zone: if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); goto failed; } break; @@ -5192,7 +5168,7 @@ retry_this_zone: page_array[nr_populated++] = page; } - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); @@ -6147,7 +6123,6 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; - unsigned long UP_flags; pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); cci = get_cpu_cacheinfo(cpu); @@ -6158,12 +6133,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) * This can reduce zone lock contention without hurting * cache-hot pages sharing. 
*/ - pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + pcp_spin_lock_nopin(pcp); if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) pcp->flags |= PCPF_FREE_HIGH_BATCH; else pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + pcp_spin_unlock_nopin(pcp); } void setup_pcp_cacheinfo(unsigned int cpu) @@ -6234,42 +6209,6 @@ void adjust_managed_page_count(struct page *page, long count) } EXPORT_SYMBOL(adjust_managed_page_count); -unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) -{ - void *pos; - unsigned long pages = 0; - - start = (void *)PAGE_ALIGN((unsigned long)start); - end = (void *)((unsigned long)end & PAGE_MASK); - for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { - struct page *page = virt_to_page(pos); - void *direct_map_addr; - - /* - * 'direct_map_addr' might be different from 'pos' - * because some architectures' virt_to_page() - * work with aliases. Getting the direct map - * address ensures that we get a _writeable_ - * alias for the memset(). - */ - direct_map_addr = page_address(page); - /* - * Perform a kasan-unchecked memset() since this memory - * has not been initialized. 
- */ - direct_map_addr = kasan_reset_tag(direct_map_addr); - if ((unsigned int)poison <= 0xFF) - memset(direct_map_addr, poison, PAGE_SIZE); - - free_reserved_page(page); - } - - if (pages && s) - pr_info("Freeing %s memory: %ldK\n", s, K(pages)); - - return pages; -} - void free_reserved_page(struct page *page) { clear_page_tag_ref(page); @@ -6553,8 +6492,8 @@ void calculate_min_free_kbytes(void) if (new_min_free_kbytes > user_min_free_kbytes) min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); else - pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", - new_min_free_kbytes, user_min_free_kbytes); + pr_warn_ratelimited("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); } diff --git a/mm/page_idle.c b/mm/page_idle.c index 96bb94c7b6c3..9c67cbac2965 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -74,7 +74,7 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, pmd_t pmdval = pmdp_get(pvmw.pmd); if (likely(pmd_present(pmdval))) - referenced |= pmdp_clear_young_notify(vma, addr, pvmw.pmd); + referenced |= pmdp_test_and_clear_young(vma, addr, pvmw.pmd); referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); } else { /* unexpected pmd-mapped page? 
*/ diff --git a/mm/page_io.c b/mm/page_io.c index a2c034660c80..70cea9e24d2f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -276,10 +276,14 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); goto out_unlock; } + + rcu_read_lock(); if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { + rcu_read_unlock(); folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; } + rcu_read_unlock(); __swap_writepage(folio, swap_plug); return 0; @@ -307,11 +311,11 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) struct cgroup_subsys_state *css; struct mem_cgroup *memcg; - memcg = folio_memcg(folio); - if (!memcg) + if (!folio_memcg_charged(folio)) return; rcu_read_lock(); + memcg = folio_memcg(folio); css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); @@ -450,14 +454,14 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* - * ->flags can be updated non-atomically (scan_swap_map_slots), + * ->flags can be updated non-atomically, * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(folio, swap_plug); /* - * ->flags can be updated non-atomically (scan_swap_map_slots), + * ->flags can be updated non-atomically, * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race * is safe. 
*/ @@ -493,7 +497,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) folio_mark_uptodate(folio); folio_unlock(folio); } - count_vm_events(PSWPIN, sio->pages); + count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT); } else { for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); diff --git a/mm/page_reporting.c b/mm/page_reporting.c index f0042d5743af..7418f2e500bb 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -12,7 +12,7 @@ #include "internal.h" /* Initialize to an unsupported value */ -unsigned int page_reporting_order = -1; +unsigned int page_reporting_order = PAGE_REPORTING_ORDER_UNSPECIFIED; static int page_order_update_notify(const char *val, const struct kernel_param *kp) { @@ -369,8 +369,9 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) * pageblock_order. */ - if (page_reporting_order == -1) { - if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER) + if (page_reporting_order == PAGE_REPORTING_ORDER_UNSPECIFIED) { + if (prdev->order != PAGE_REPORTING_ORDER_UNSPECIFIED && + prdev->order <= MAX_PAGE_ORDER) page_reporting_order = prdev->order; else page_reporting_order = pageblock_order; diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2708c2b3ac1f..53a8997ec043 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -151,9 +151,8 @@ void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pte_user_accessible_page(pte, addr)) { + if (pte_user_accessible_page(mm, addr, pte)) page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); - } } EXPORT_SYMBOL(__page_table_check_pte_clear); @@ -163,9 +162,8 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pmd_user_accessible_page(pmd, addr)) { + if (pmd_user_accessible_page(mm, addr, pmd)) page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); - } } 
EXPORT_SYMBOL(__page_table_check_pmd_clear); @@ -175,9 +173,8 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pud_user_accessible_page(pud, addr)) { + if (pud_user_accessible_page(mm, addr, pud)) page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); - } } EXPORT_SYMBOL(__page_table_check_pud_clear); @@ -211,7 +208,7 @@ void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i)); - if (pte_user_accessible_page(pte, addr)) + if (pte_user_accessible_page(mm, addr, pte)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } EXPORT_SYMBOL(__page_table_check_ptes_set); @@ -241,7 +238,7 @@ void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i)); - if (pmd_user_accessible_page(pmd, addr)) + if (pmd_user_accessible_page(mm, addr, pmd)) page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } EXPORT_SYMBOL(__page_table_check_pmds_set); @@ -257,7 +254,7 @@ void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i)); - if (pud_user_accessible_page(pud, addr)) + if (pud_user_accessible_page(mm, addr, pud)) page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } EXPORT_SYMBOL(__page_table_check_puds_set); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index b38a1d00c971..a4d52fdb3056 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -269,11 +269,6 @@ restart: spin_unlock(pvmw->ptl); pvmw->ptl = NULL; } else if (!pmd_present(pmde)) { - /* - * If PVMW_SYNC, take and drop THP pmd lock so that we - * cannot return prematurely, while zap_huge_pmd() has - * cleared *pmd but not decremented compound_mapcount(). 
- */ const softleaf_t entry = softleaf_from_pmd(pmde); if (softleaf_is_device_private(entry)) { @@ -284,11 +279,9 @@ restart: if ((pvmw->flags & PVMW_SYNC) && thp_vma_suitable_order(vma, pvmw->address, PMD_ORDER) && - (pvmw->nr_pages >= HPAGE_PMD_NR)) { - spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + (pvmw->nr_pages >= HPAGE_PMD_NR)) + sync_with_folio_pmd_zap(mm, pvmw->pmd); - spin_unlock(ptl); - } step_forward(pvmw, PMD_SIZE); continue; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index a94c401ab2cf..3ae2586ff45b 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -5,7 +5,6 @@ #include <linux/hugetlb.h> #include <linux/mmu_context.h> #include <linux/swap.h> -#include <linux/leafops.h> #include <asm/tlbflush.h> @@ -97,6 +96,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { + pud_t pudval = pudp_get(pud); pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; @@ -105,6 +105,24 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, int err = 0; int depth = real_depth(3); + /* + * For PTE handling, pte_offset_map_lock() takes care of checking + * whether there actually is a page table. But it also has to be + * very careful about concurrent page table reclaim. + * + * Similarly, we have to be careful here - a PUD entry that points + * to a PMD table cannot go away, so we can just walk it. But if + * it's something else, we need to ensure we didn't race something, + * so need to retry. + * + * A pertinent example of this is a PUD refault after PUD split - + * we will need to split again or risk accessing invalid memory. 
+ */ + if (!pud_present(pudval) || pud_leaf(pudval)) { + walk->action = ACTION_AGAIN; + return 0; + } + pmd = pmd_offset(pud, addr); do { again: @@ -218,12 +236,12 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, else if (pud_leaf(*pud) || !pud_present(*pud)) continue; /* Nothing to do. */ - if (pud_none(*pud)) - goto again; - err = walk_pmd_range(pud, addr, next, walk); if (err) break; + + if (walk->action == ACTION_AGAIN) + goto again; } while (pud++, addr = next, addr != end); return err; @@ -841,9 +859,6 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, * VM as documented by vm_normal_page(). If requested, zeropages will be * returned as well. * - * As default, this function only considers present page table entries. - * If requested, it will also consider migration entries. - * * If this function returns NULL it might either indicate "there is nothing" or * "there is nothing suitable". * @@ -854,11 +869,10 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, * that call. * * @fw->page will correspond to the page that is effectively referenced by - * @addr. However, for migration entries and shared zeropages @fw->page is - * set to NULL. Note that large folios might be mapped by multiple page table - * entries, and this function will always only lookup a single entry as - * specified by @addr, which might or might not cover more than a single page of - * the returned folio. + * @addr. However, for shared zeropages @fw->page is set to NULL. Note that + * large folios might be mapped by multiple page table entries, and this + * function will always only lookup a single entry as specified by @addr, which + * might or might not cover more than a single page of the returned folio. 
* * This function must *not* be used as a naive replacement for * get_user_pages() / pin_user_pages(), especially not to perform DMA or @@ -885,7 +899,7 @@ struct folio *folio_walk_start(struct folio_walk *fw, folio_walk_flags_t flags) { unsigned long entry_size; - bool expose_page = true; + bool zeropage = false; struct page *page; pud_t *pudp, pud; pmd_t *pmdp, pmd; @@ -933,10 +947,6 @@ struct folio *folio_walk_start(struct folio_walk *fw, if (page) goto found; } - /* - * TODO: FW_MIGRATION support for PUD migration entries - * once there are relevant users. - */ spin_unlock(ptl); goto not_found; } @@ -970,16 +980,9 @@ pmd_table: } else if ((flags & FW_ZEROPAGE) && is_huge_zero_pmd(pmd)) { page = pfn_to_page(pmd_pfn(pmd)); - expose_page = false; + zeropage = true; goto found; } - } else if ((flags & FW_MIGRATION) && - pmd_is_migration_entry(pmd)) { - const softleaf_t entry = softleaf_from_pmd(pmd); - - page = softleaf_to_page(entry); - expose_page = false; - goto found; } spin_unlock(ptl); goto not_found; @@ -1004,15 +1007,7 @@ pte_table: if ((flags & FW_ZEROPAGE) && is_zero_pfn(pte_pfn(pte))) { page = pfn_to_page(pte_pfn(pte)); - expose_page = false; - goto found; - } - } else if (!pte_none(pte)) { - const softleaf_t entry = softleaf_from_pte(pte); - - if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) { - page = softleaf_to_page(entry); - expose_page = false; + zeropage = true; goto found; } } @@ -1021,7 +1016,7 @@ not_found: vma_pgtable_walk_end(vma); return NULL; found: - if (expose_page) + if (!zeropage) /* Note: Offset from the mapped page, not the folio start. 
*/ fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT); else diff --git a/mm/percpu.c b/mm/percpu.c index a2107bdebf0b..b0676b8054ed 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1622,7 +1622,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, return true; objcg = current_obj_cgroup(); - if (!objcg) + if (!objcg || obj_cgroup_is_root(objcg)) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index af7966169d69..b91b1a98029c 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -81,10 +81,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - int young; + bool young; + young = ptep_test_and_clear_young(vma, address, ptep); if (young) flush_tlb_page(vma, address); @@ -123,10 +124,11 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { - int young; + bool young; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) diff --git a/mm/rmap.c b/mm/rmap.c index 391337282e3f..78b7fb5f367c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -457,6 +457,13 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma) list_del(&avc->same_vma); anon_vma_chain_free(avc); } + + /* + * The anon_vma assigned to this VMA is no longer valid, as we were not + * able to correctly clone AVC state. Avoid inconsistent anon_vma tree + * state by resetting. 
+ */ + vma->anon_vma = NULL; } /** @@ -958,25 +965,25 @@ static bool folio_referenced_one(struct folio *folio, return false; } - if (lru_gen_enabled() && pvmw.pte) { - if (lru_gen_look_around(&pvmw)) - referenced++; - } else if (pvmw.pte) { - if (folio_test_large(folio)) { - unsigned long end_addr = pmd_addr_end(address, vma->vm_end); - unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; - pte_t pteval = ptep_get(pvmw.pte); + if (pvmw.pte && folio_test_large(folio)) { + const unsigned long end_addr = pmd_addr_end(address, vma->vm_end); + const unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; + pte_t pteval = ptep_get(pvmw.pte); - nr = folio_pte_batch(folio, pvmw.pte, - pteval, max_nr); - } + nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr); + } - ptes += nr; + /* + * When LRU is switching, we don’t know where the surrounding folios + * are. —they could be on active/inactive lists or on MGLRU. So the + * simplest approach is to disable this look-around optimization. + */ + if (lru_gen_enabled() && !lru_gen_switching() && pvmw.pte) { + if (lru_gen_look_around(&pvmw, nr)) + referenced++; + } else if (pvmw.pte) { if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr)) referenced++; - /* Skip the batched PTEs */ - pvmw.pte += nr - 1; - pvmw.address += (nr - 1) * PAGE_SIZE; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) @@ -986,6 +993,7 @@ static bool folio_referenced_one(struct folio *folio, WARN_ON_ONCE(1); } + ptes += nr; pra->mapcount -= nr; /* * If we are sure that we batched the entire folio, @@ -995,6 +1003,10 @@ static bool folio_referenced_one(struct folio *folio, page_vma_mapped_walk_done(&pvmw); break; } + + /* Skip the batched PTEs */ + pvmw.pte += nr - 1; + pvmw.address += (nr - 1) * PAGE_SIZE; } if (referenced) @@ -1065,6 +1077,7 @@ int folio_referenced(struct folio *folio, int is_locked, .invalid_vma = invalid_folio_referenced_vma, }; + 
VM_WARN_ON_ONCE_FOLIO(folio_is_zone_device(folio), folio); *vm_flags = 0; if (!pra.mapcount) return 0; @@ -2053,7 +2066,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } if (!pvmw.pte) { - if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + if (folio_test_lazyfree(folio)) { if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) goto walk_done; /* diff --git a/mm/secretmem.c b/mm/secretmem.c index 11a779c812a7..5f57ac4720d3 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -122,7 +122,7 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = vma_desc_size(desc); - if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) + if (!vma_desc_test_any(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) return -EINVAL; vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT); diff --git a/mm/shmem.c b/mm/shmem.c index b40f3cd48961..3b5dc21b323c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -61,7 +61,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/writeback.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> #include <linux/splice.h> @@ -1113,7 +1113,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; @@ -1425,7 +1425,10 @@ static void shmem_evict_inode(struct inode *inode) } } - simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); + if (info->xattrs) { + simple_xattrs_free(info->xattrs, sbinfo->max_inodes ? 
&freed : NULL); + kfree(info->xattrs); + } shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); @@ -1510,7 +1513,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; int ret = 0; do { @@ -2044,14 +2047,8 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); struct folio *new, *swapcache; int nr_pages = 1 << order; - gfp_t alloc_gfp; + gfp_t alloc_gfp = gfp; - /* - * We have arrived here because our zones are constrained, so don't - * limit chance of success with further cpuset and node constraints. - */ - gfp &= ~GFP_CONSTRAINT_MASK; - alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); @@ -3101,7 +3098,6 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); - simple_xattrs_init(&info->xattrs); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); @@ -3181,119 +3177,99 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD -int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) -{ - struct inode *inode = file_inode(dst_vma->vm_file); - struct shmem_inode_info *info = SHMEM_I(inode); +static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t pgoff = linear_page_index(vma, addr); gfp_t gfp 
= mapping_gfp_mask(mapping); - pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - void *page_kaddr; struct folio *folio; - int ret; - pgoff_t max_off; - if (shmem_inode_acct_blocks(inode, 1)) { - /* - * We may have got a page, returned -ENOENT triggering a retry, - * and now we find ourselves with -ENOMEM. Release the page, to - * avoid a BUG_ON in our caller. - */ - if (unlikely(*foliop)) { - folio_put(*foliop); - *foliop = NULL; - } - return -ENOMEM; - } - - if (!*foliop) { - ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, 0, info, pgoff); - if (!folio) - goto out_unacct_blocks; + if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) + return NULL; - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { - page_kaddr = kmap_local_folio(folio, 0); - /* - * The read mmap_lock is held here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. 
- */ - pagefault_disable(); - ret = copy_from_user(page_kaddr, - (const void __user *)src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(page_kaddr); - - /* fallback to copy_from_user outside mmap_lock */ - if (unlikely(ret)) { - *foliop = folio; - ret = -ENOENT; - /* don't free the page */ - goto out_unacct_blocks; - } + folio = shmem_alloc_folio(gfp, 0, info, pgoff); + if (!folio) + return NULL; - flush_dcache_folio(folio); - } else { /* ZEROPAGE */ - clear_user_highpage(&folio->page, dst_addr); - } - } else { - folio = *foliop; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - *foliop = NULL; + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; } - VM_BUG_ON(folio_test_locked(folio)); - VM_BUG_ON(folio_test_swapbacked(folio)); + return folio; +} + +static int shmem_mfill_filemap_add(struct folio *folio, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + pgoff_t pgoff = linear_page_index(vma, addr); + gfp_t gfp = mapping_gfp_mask(mapping); + int err; + __folio_set_locked(folio); __folio_set_swapbacked(folio); - __folio_mark_uptodate(folio); - - ret = -EFAULT; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(pgoff >= max_off)) - goto out_release; - ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); - if (ret) - goto out_release; + err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); + if (err) + goto err_unlock; - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, flags); - if (ret) - goto out_delete_from_cache; + if (shmem_inode_acct_blocks(inode, 1)) { + err = -ENOMEM; + goto err_delete_from_cache; + } + folio_add_lru(folio); shmem_recalc_inode(inode, 1, 0); - folio_unlock(folio); + return 0; -out_delete_from_cache: + +err_delete_from_cache: 
filemap_remove_folio(folio); -out_release: +err_unlock: + folio_unlock(folio); + return err; +} + +static void shmem_mfill_filemap_remove(struct folio *folio, + struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); folio_unlock(folio); - folio_put(folio); -out_unacct_blocks: - shmem_inode_unacct_blocks(inode, 1); - return ret; } + +static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff) +{ + struct folio *folio; + int err; + + err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); + if (err) + return ERR_PTR(err); + + return folio; +} + +static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops shmem_uffd_ops = { + .can_userfault = shmem_can_userfault, + .get_folio_noalloc = shmem_get_folio_noalloc, + .alloc_folio = shmem_mfill_folio_alloc, + .filemap_add = shmem_mfill_filemap_add, + .filemap_remove = shmem_mfill_filemap_remove, +}; #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS @@ -4255,10 +4231,13 @@ static int shmem_initxattrs(struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; - struct simple_xattr *new_xattr; size_t ispace = 0; size_t len; + CLASS(simple_xattrs, xattrs)(); + if (IS_ERR(xattrs)) + return PTR_ERR(xattrs); + if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, @@ -4277,24 +4256,24 @@ static int shmem_initxattrs(struct inode *inode, } for (xattr = xattr_array; xattr->name != NULL; xattr++) { - new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); - if (!new_xattr) + CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); + if (IS_ERR(new_xattr)) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 
GFP_KERNEL_ACCOUNT); - if (!new_xattr->name) { - kvfree(new_xattr); + if (!new_xattr->name) break; - } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - simple_xattr_add(&info->xattrs, new_xattr); + if (simple_xattr_add(xattrs, new_xattr)) + break; + retain_and_null_ptr(new_xattr); } if (xattr->name != NULL) { @@ -4303,10 +4282,10 @@ static int shmem_initxattrs(struct inode *inode, sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } - simple_xattrs_free(&info->xattrs, NULL); return -ENOMEM; } + smp_store_release(&info->xattrs, no_free_ptr(xattrs)); return 0; } @@ -4315,9 +4294,14 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(info->xattrs); + if (!xattrs) + return -ENODATA; name = xattr_full_name(handler, name); - return simple_xattr_get(&info->xattrs, name, buffer, size); + return simple_xattr_get(xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, @@ -4328,10 +4312,16 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct simple_xattrs *xattrs; struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); + + xattrs = simple_xattrs_lazy_alloc(&info->xattrs, value, flags); + if (IS_ERR_OR_NULL(xattrs)) + return PTR_ERR(xattrs); + if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); @@ -4344,13 +4334,13 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, return -ENOSPC; } - old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(xattrs, name, 
value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); - simple_xattr_free(old_xattr); + simple_xattr_free_rcu(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); @@ -4391,7 +4381,9 @@ static const struct xattr_handler * const shmem_xattr_handlers[] = { static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); + + return simple_xattr_list(d_inode(dentry), READ_ONCE(info->xattrs), + buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ @@ -5313,6 +5305,9 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { @@ -5322,6 +5317,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; int shmem_init_fs_context(struct fs_context *fc) @@ -5564,8 +5562,7 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ - if (shmem_huge == SHMEM_HUGE_FORCE && - order != HPAGE_PMD_ORDER) + if (shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order)) return -EINVAL; spin_lock(&huge_shmem_orders_lock); diff --git a/mm/show_mem.c b/mm/show_mem.c index 24078ac3e6bc..43aca5a2ac99 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -254,6 +254,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " sec_pagetables:%lukB" " all_unreclaimable? 
%s" " Balloon:%lukB" + " gpu_active:%lukB" + " gpu_reclaim:%lukB" "\n", pgdat->node_id, K(node_page_state(pgdat, NR_ACTIVE_ANON)), @@ -279,7 +281,9 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), str_yes_no(kswapd_test_hopeless(pgdat)), - K(node_page_state(pgdat, NR_BALLOON_PAGES))); + K(node_page_state(pgdat, NR_BALLOON_PAGES)), + K(node_page_state(pgdat, NR_GPU_ACTIVE)), + K(node_page_state(pgdat, NR_GPU_RECLAIM))); } for_each_populated_zone(zone) { diff --git a/mm/shrinker.c b/mm/shrinker.c index 7b61fc0ee78f..76b3f750cf65 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -219,6 +219,8 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; + if (mem_cgroup_kmem_disabled() && !(shrinker->flags & SHRINKER_NONSLAB)) + return -ENOSYS; mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); @@ -286,14 +288,10 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) { int nid, index, offset; long nr; - struct mem_cgroup *parent; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct shrinker_info *child_info, *parent_info; struct shrinker_info_unit *child_unit, *parent_unit; - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - /* Prevent from concurrent shrinker_info expand */ mutex_lock(&shrinker_mutex); for_each_node(nid) { @@ -410,7 +408,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); + freeable, delta, total_scan, priority, + shrinkctl->memcg); /* * Normally, we should not scan less than batch_size objects in one @@ -461,7 +460,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ new_nr = add_nr_deferred(next_deferred, shrinker, 
shrinkctl); - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan, + shrinkctl->memcg); return freed; } @@ -544,8 +544,11 @@ again: /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) + !(shrinker->flags & SHRINKER_NONSLAB)) { + clear_bit(offset, unit->map); + shrinker_put(shrinker); continue; + } ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { @@ -716,6 +719,7 @@ non_memcg: * - non-memcg-aware shrinkers * - !CONFIG_MEMCG * - memcg is disabled by kernel command line + * - non-slab shrinkers: when memcg kmem is disabled */ size = sizeof(*shrinker->nr_deferred); if (flags & SHRINKER_NUMA_AWARE) diff --git a/mm/slab.h b/mm/slab.h index e9ab292acd22..bf2f87acf5e3 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -94,7 +94,7 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, flags); -SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ +SLAB_MATCH(compound_info, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG SLAB_MATCH(memcg_data, obj_exts); @@ -131,11 +131,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist */ static inline struct slab *page_slab(const struct page *page) { - unsigned long head; - - head = READ_ONCE(page->compound_head); - if (head & 1) - page = (struct page *)(head - 1); + page = compound_head(page); if (data_race(page->page_type >> 24) != PGTY_slab) page = NULL; @@ -191,6 +187,11 @@ struct kmem_cache_order_objects { unsigned int x; }; +struct kmem_cache_per_node_ptrs { + struct node_barn *barn; + struct kmem_cache_node *node; +}; + /* * Slab cache management. 
*/ @@ -247,7 +248,7 @@ struct kmem_cache { struct kmem_cache_stats __percpu *cpu_stats; #endif - struct kmem_cache_node *node[MAX_NUMNODES]; + struct kmem_cache_per_node_ptrs per_node[MAX_NUMNODES]; }; /* diff --git a/mm/slub.c b/mm/slub.c index 2b2d33cc735c..161079ac5ba1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -59,7 +59,7 @@ * 0. cpu_hotplug_lock * 1. slab_mutex (Global Mutex) * 2a. kmem_cache->cpu_sheaves->lock (Local trylock) - * 2b. node->barn->lock (Spinlock) + * 2b. barn->lock (Spinlock) * 2c. node->list_lock (Spinlock) * 3. slab_lock(slab) (Only on some arches) * 4. object_map_lock (Only for debugging) @@ -136,7 +136,7 @@ * or spare sheaf can handle the allocation or free, there is no other * overhead. * - * node->barn->lock (spinlock) + * barn->lock (spinlock) * * This lock protects the operations on per-NUMA-node barn. It can quickly * serve an empty or full sheaf if available, and avoid more expensive refill @@ -436,26 +436,24 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif - struct node_barn *barn; }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { - return s->node[node]; + return s->per_node[node].node; +} + +static inline struct node_barn *get_barn_node(struct kmem_cache *s, int node) +{ + return s->per_node[node].barn; } /* - * Get the barn of the current cpu's closest memory node. It may not exist on - * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES + * Get the barn of the current cpu's NUMA node. It may be a memoryless node. */ static inline struct node_barn *get_barn(struct kmem_cache *s) { - struct kmem_cache_node *n = get_node(s, numa_mem_id()); - - if (!n) - return NULL; - - return n->barn; + return get_barn_node(s, numa_node_id()); } /* @@ -475,6 +473,12 @@ static inline struct node_barn *get_barn(struct kmem_cache *s) static nodemask_t slab_nodes; /* + * Similar to slab_nodes but for where we have node_barn allocated. 
+ * Corresponds to N_ONLINE nodes. + */ +static nodemask_t slab_barn_nodes; + +/* * Workqueue used for flushing cpu and kfree_rcu sheaves. */ static struct workqueue_struct *flushwq; @@ -2822,24 +2826,6 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, return 0; } -static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf); - -static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) -{ - struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); - - if (!sheaf) - return NULL; - - if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - sheaf_flush_unused(s, sheaf); - free_empty_sheaf(s, sheaf); - return NULL; - } - - return sheaf; -} - /* * Maximum number of objects freed during a single flush of main pcs sheaf. * Translates directly to an on-stack array size. @@ -4082,6 +4068,51 @@ void flush_all_rcu_sheaves(void) rcu_barrier(); } +static int slub_cpu_setup(unsigned int cpu) +{ + int nid = cpu_to_node(cpu); + struct kmem_cache *s; + int ret = 0; + + /* + * we never clear a nid so it's safe to do a quick check before taking + * the mutex, and then recheck to handle parallel cpu hotplug safely + */ + if (node_isset(nid, slab_barn_nodes)) + return 0; + + mutex_lock(&slab_mutex); + + if (node_isset(nid, slab_barn_nodes)) + goto out; + + list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn; + + /* + * barn might already exist if a previous callback failed midway + */ + if (!cache_has_sheaves(s) || get_barn_node(s, nid)) + continue; + + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + + barn_init(barn); + s->per_node[nid].barn = barn; + } + node_set(nid, slab_barn_nodes); + +out: + mutex_unlock(&slab_mutex); + + return ret; +} + /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. 
@@ -4611,34 +4642,35 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, if (!allow_spin) return NULL; - if (empty) { - if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - full = empty; - } else { - /* - * we must be very low on memory so don't bother - * with the barn - */ - sheaf_flush_unused(s, empty); - free_empty_sheaf(s, empty); - } - } else { - full = alloc_full_sheaf(s, gfp); + if (!empty) { + empty = alloc_empty_sheaf(s, gfp); + if (!empty) + return NULL; } - if (!full) + if (refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) { + /* + * we must be very low on memory so don't bother + * with the barn + */ + sheaf_flush_unused(s, empty); + free_empty_sheaf(s, empty); + return NULL; + } + + full = empty; + empty = NULL; if (!local_trylock(&s->cpu_sheaves->lock)) goto barn_put; pcs = this_cpu_ptr(s->cpu_sheaves); /* - * If we are returning empty sheaf, we either got it from the - * barn or had to allocate one. If we are returning a full - * sheaf, it's due to racing or being migrated to a different - * cpu. Breaching the barn's sheaf limits should be thus rare - * enough so just ignore them to simplify the recovery. + * If we put any empty or full sheaf to the barn below, it's due to + * racing or being migrated to a different cpu. Breaching the barn's + * sheaf limits should be thus rare enough so just ignore them to + * simplify the recovery. */ if (pcs->main->size == 0) { @@ -5088,12 +5120,15 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, } /* - * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least - * the given size + * Refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size. * - * the sheaf might be replaced by a new one when requesting more than - * s->sheaf_capacity objects if such replacement is necessary, but the refill - * fails (returning -ENOMEM), the existing sheaf is left intact + * Return: 0 on success. 
The sheaf will contain at least @size objects. + * The sheaf might have been replaced with a new one if more than + * sheaf->capacity objects are requested. + * + * Return: -ENOMEM on failure. Some objects might have been added to the sheaf + * but the sheaf will not be replaced. * * In practice we always refill to full sheaf's capacity. */ @@ -5788,7 +5823,6 @@ bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) static void rcu_free_sheaf(struct rcu_head *head) { - struct kmem_cache_node *n; struct slab_sheaf *sheaf; struct node_barn *barn = NULL; struct kmem_cache *s; @@ -5811,12 +5845,10 @@ static void rcu_free_sheaf(struct rcu_head *head) if (__rcu_free_sheaf_prepare(s, sheaf)) goto flush; - n = get_node(s, sheaf->node); - if (!n) + barn = get_barn_node(s, sheaf->node); + if (!barn) goto flush; - barn = n->barn; - /* due to slab_free_hook() */ if (unlikely(sheaf->size == 0)) goto empty; @@ -5938,7 +5970,7 @@ do_free: rcu_sheaf = NULL; } else { pcs->rcu_free = NULL; - rcu_sheaf->node = numa_mem_id(); + rcu_sheaf->node = numa_node_id(); } /* @@ -5960,6 +5992,57 @@ fail: return false; } +static __always_inline bool can_free_to_pcs(struct slab *slab) +{ + int slab_node; + int numa_node; + + if (!IS_ENABLED(CONFIG_NUMA)) + goto check_pfmemalloc; + + slab_node = slab_nid(slab); + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES + /* + * numa_mem_id() points to the closest node with memory so only allow + * objects from that node to the percpu sheaves + */ + numa_node = numa_mem_id(); + + if (likely(slab_node == numa_node)) + goto check_pfmemalloc; +#else + + /* + * numa_mem_id() is only a wrapper to numa_node_id() which is where this + * cpu belongs to, but it might be a memoryless node anyway. We don't + * know what the closest node is. 
+ */ + numa_node = numa_node_id(); + + /* freed object is from this cpu's node, proceed */ + if (likely(slab_node == numa_node)) + goto check_pfmemalloc; + + /* + * Freed object isn't from this cpu's node, but that node is memoryless + * or only has ZONE_MOVABLE memory, which slab cannot allocate from. + * Proceed as it's better to cache remote objects than falling back to + * the slowpath for everything. The allocation side can never obtain + * a local object anyway, if none exist. We don't have numa_mem_id() to + * point to the closest node as we would on a proper memoryless node + * setup. + */ + if (unlikely(!node_state(numa_node, N_NORMAL_MEMORY))) + goto check_pfmemalloc; +#endif + + return false; + +check_pfmemalloc: + return likely(!slab_test_pfmemalloc(slab)); +} + /* * Bulk free objects to the percpu sheaves. * Unlike free_to_pcs() this includes the calls to all necessary hooks @@ -5974,7 +6057,6 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) struct node_barn *barn; void *remote_objects[PCS_BATCH_MAX]; unsigned int remote_nr = 0; - int node = numa_mem_id(); next_remote_batch: while (i < size) { @@ -5988,8 +6070,7 @@ next_remote_batch: continue; } - if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) - || slab_test_pfmemalloc(slab))) { + if (unlikely(!can_free_to_pcs(slab))) { remote_objects[remote_nr] = p[i]; p[i] = p[--size]; if (++remote_nr >= PCS_BATCH_MAX) @@ -6165,11 +6246,8 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) return; - if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) - && likely(!slab_test_pfmemalloc(slab))) { - if (likely(free_to_pcs(s, object, true))) - return; - } + if (likely(can_free_to_pcs(slab)) && likely(free_to_pcs(s, object, true))) + return; __slab_free(s, slab, object, object, 1, addr); stat(s, FREE_SLOWPATH); @@ -6540,10 +6618,8 @@ void kfree_nolock(const void 
*object) */ kasan_slab_free(s, x, false, false, /* skip quarantine */true); - if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { - if (likely(free_to_pcs(s, x, false))) - return; - } + if (likely(can_free_to_pcs(slab)) && likely(free_to_pcs(s, x, false))) + return; /* * __slab_free() can locklessly cmpxchg16 into a slab, but then it might @@ -6569,16 +6645,6 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, if (!kasan_check_byte(p)) return NULL; - /* - * If reallocation is not necessary (e. g. the new size is less - * than the current allocated size), the current allocation will be - * preserved unless __GFP_THISNODE is set. In the latter case a new - * allocation on the requested node will be attempted. - */ - if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && - nid != page_to_nid(virt_to_page(p))) - goto alloc_new; - if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { @@ -6597,6 +6663,16 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, } } + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. + */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + /* If the old object doesn't fit, allocate a bigger one */ if (new_size > ks) goto alloc_new; @@ -6631,7 +6707,7 @@ alloc_new: if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. 
*/ kasan_disable_current(); - memcpy(ret, kasan_reset_tag(p), orig_size ?: ks); + memcpy(ret, kasan_reset_tag(p), min(new_size, (size_t)(orig_size ?: ks))); kasan_enable_current(); } @@ -6865,7 +6941,7 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig if (p) { /* We already know that `p` is not a vmalloc address. */ kasan_disable_current(); - memcpy(n, kasan_reset_tag(p), ksize(p)); + memcpy(n, kasan_reset_tag(p), min(size, ksize(p))); kasan_enable_current(); kfree(p); @@ -7427,7 +7503,7 @@ static inline int calculate_order(unsigned int size) } static void -init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) +init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -7437,9 +7513,6 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif - n->barn = barn; - if (barn) - barn_init(barn); } #ifdef CONFIG_SLUB_STATS @@ -7534,8 +7607,8 @@ static void early_kmem_cache_node_alloc(int node) n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; - kmem_cache_node->node[node] = n; - init_kmem_cache_node(n, NULL); + kmem_cache_node->per_node[node].node = n; + init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); /* @@ -7550,15 +7623,20 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) int node; struct kmem_cache_node *n; - for_each_kmem_cache_node(s, node, n) { - if (n->barn) { - WARN_ON(n->barn->nr_full); - WARN_ON(n->barn->nr_empty); - kfree(n->barn); - n->barn = NULL; - } + for_each_node(node) { + struct node_barn *barn = get_barn_node(s, node); + + if (!barn) + continue; - s->node[node] = NULL; + WARN_ON(barn->nr_full); + WARN_ON(barn->nr_empty); + kfree(barn); + s->per_node[node].barn = NULL; + } + + for_each_kmem_cache_node(s, node, n) { + s->per_node[node].node = NULL; 
kmem_cache_free(kmem_cache_node, n); } } @@ -7579,31 +7657,36 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; - struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } - if (cache_has_sheaves(s)) { - barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); - - if (!barn) - return 0; - } - n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - if (!n) { - kfree(barn); + if (!n) return 0; - } - init_kmem_cache_node(n, barn); + init_kmem_cache_node(n); + s->per_node[node].node = n; + } + + if (slab_state == DOWN || !cache_has_sheaves(s)) + return 1; + + for_each_node_mask(node, slab_barn_nodes) { + struct node_barn *barn; + + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); - s->node[node] = n; + if (!barn) + return 0; + + barn_init(barn); + s->per_node[node].barn = barn; } + return 1; } @@ -7892,10 +7975,15 @@ int __kmem_cache_shutdown(struct kmem_cache *s) if (cache_has_sheaves(s)) rcu_barrier(); + for_each_node(node) { + struct node_barn *barn = get_barn_node(s, node); + + if (barn) + barn_shrink(s, barn); + } + /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { - if (n->barn) - barn_shrink(s, n->barn); free_partial(s, n); if (n->nr_partial || node_nr_slabs(n)) return 1; @@ -8105,14 +8193,18 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) unsigned long flags; int ret = 0; + for_each_node(node) { + struct node_barn *barn = get_barn_node(s, node); + + if (barn) + barn_shrink(s, barn); + } + for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); - if (n->barn) - barn_shrink(s, n->barn); - spin_lock_irqsave(&n->list_lock, flags); /* @@ -8201,7 +8293,8 @@ static int slab_mem_going_online_callback(int nid) if (get_node(s, nid)) continue; - if (cache_has_sheaves(s)) { + if (cache_has_sheaves(s) && !get_barn_node(s, nid)) { + 
barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); if (!barn) { @@ -8222,15 +8315,20 @@ static int slab_mem_going_online_callback(int nid) goto out; } - init_kmem_cache_node(n, barn); + init_kmem_cache_node(n); + s->per_node[nid].node = n; - s->node[nid] = n; + if (barn) { + barn_init(barn); + s->per_node[nid].barn = barn; + } } /* * Any cache created after this point will also have kmem_cache_node - * initialized for the new node. + * and barn initialized for the new node. */ node_set(nid, slab_nodes); + node_set(nid, slab_barn_nodes); out: mutex_unlock(&slab_mutex); return ret; @@ -8309,7 +8407,7 @@ static void __init bootstrap_cache_sheaves(struct kmem_cache *s) if (!capacity) return; - for_each_node_mask(node, slab_nodes) { + for_each_node_mask(node, slab_barn_nodes) { struct node_barn *barn; barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); @@ -8320,7 +8418,7 @@ static void __init bootstrap_cache_sheaves(struct kmem_cache *s) } barn_init(barn); - get_node(s, node)->barn = barn; + s->per_node[node].barn = barn; } for_each_possible_cpu(cpu) { @@ -8381,6 +8479,9 @@ void __init kmem_cache_init(void) for_each_node_state(node, N_MEMORY) node_set(node, slab_nodes); + for_each_online_node(node) + node_set(node, slab_barn_nodes); + create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); @@ -8391,8 +8492,8 @@ void __init kmem_cache_init(void) slab_state = PARTIAL; create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *), + offsetof(struct kmem_cache, per_node) + + nr_node_ids * sizeof(struct kmem_cache_per_node_ptrs), SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); @@ -8407,7 +8508,7 @@ void __init kmem_cache_init(void) /* Setup random freelists for each cache */ init_freelist_randomization(); - cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, + 
cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", slub_cpu_setup, slub_cpu_dead); pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", @@ -8874,7 +8975,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return len; } -#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) +#define to_slab_attr(n) container_of_const(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj) struct slab_attribute { @@ -8884,10 +8985,10 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) + static const struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) #define SLAB_ATTR(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) + static const struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -9281,7 +9382,7 @@ static ssize_t skip_kfence_store(struct kmem_cache *s, SLAB_ATTR(skip_kfence); #endif -static struct attribute *slab_attrs[] = { +static const struct attribute *const slab_attrs[] = { &slab_size_attr.attr, &object_size_attr.attr, &objs_per_slab_attr.attr, @@ -9358,15 +9459,13 @@ static struct attribute *slab_attrs[] = { NULL }; -static const struct attribute_group slab_attr_group = { - .attrs = slab_attrs, -}; +ATTRIBUTE_GROUPS(slab); static ssize_t slab_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct slab_attribute *attribute; + const struct slab_attribute *attribute; struct kmem_cache *s; attribute = to_slab_attr(attr); @@ -9382,7 +9481,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { - struct slab_attribute *attribute; + const struct slab_attribute *attribute; struct kmem_cache *s; attribute = to_slab_attr(attr); @@ -9407,6 +9506,7 @@ static const struct sysfs_ops slab_sysfs_ops = { static const 
struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, .release = kmem_cache_release, + .default_groups = slab_groups, }; static struct kset *slab_kset; @@ -9494,10 +9594,6 @@ static int sysfs_slab_add(struct kmem_cache *s) if (err) goto out; - err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) - goto out_del_kobj; - if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); @@ -9506,9 +9602,6 @@ out: if (!unmergeable) kfree(name); return err; -out_del_kobj: - kobject_del(&s->kobj); - goto out; } void sysfs_slab_unlink(struct kmem_cache *s) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 37522d6cb398..6eadb9d116e4 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -62,7 +62,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (slab_is_available()) { gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; int order = get_order(size); - static bool warned; + static bool warned __meminitdata; struct page *page; page = alloc_pages_node(node, gfp_mask, order); @@ -303,59 +303,6 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, } /* - * Undo populate_hvo, and replace it with a normal base page mapping. - * Used in memory init in case a HVO mapping needs to be undone. - * - * This can happen when it is discovered that a memblock allocated - * hugetlb page spans multiple zones, which can only be verified - * after zones have been initialized. - * - * We know that: - * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually - * allocated through memblock, and mapped. - * - * 2) The rest of the vmemmap pages are mirrors of the last head page. - */ -int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end, - int node, unsigned long headsize) -{ - unsigned long maddr, pfn; - pte_t *pte; - int headpages; - - /* - * Should only be called early in boot, so nothing will - * be accessing these page structures. 
- */ - WARN_ON(!early_boot_irqs_disabled); - - headpages = headsize >> PAGE_SHIFT; - - /* - * Clear mirrored mappings for tail page structs. - */ - for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) { - pte = virt_to_kpte(maddr); - pte_clear(&init_mm, maddr, pte); - } - - /* - * Clear and free mappings for head page and first tail page - * structs. - */ - for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) { - pte = virt_to_kpte(maddr); - pfn = pte_pfn(ptep_get(pte)); - pte_clear(&init_mm, maddr, pte); - memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE); - } - - flush_tlb_kernel_range(addr, end); - - return vmemmap_populate(addr, end, node, NULL); -} - -/* * Write protect the mirrored tail page structs for HVO. This will be * called from the hugetlb code when gathering and initializing the * memblock allocated gigantic pages. The write protect can't be @@ -378,16 +325,54 @@ void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end, } } -/* - * Populate vmemmap pages HVO-style. The first page contains the head - * page and needed tail pages, the other ones are mirrors of the first - * page. - */ +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) +{ + struct page *p, *tail; + unsigned int idx; + int node = zone_to_nid(zone); + + if (WARN_ON_ONCE(order < VMEMMAP_TAIL_MIN_ORDER)) + return NULL; + if (WARN_ON_ONCE(order > MAX_FOLIO_ORDER)) + return NULL; + + idx = order - VMEMMAP_TAIL_MIN_ORDER; + tail = zone->vmemmap_tails[idx]; + if (tail) + return tail; + + /* + * Only allocate the page, but do not initialize it. + * + * Any initialization done here will be overwritten by memmap_init(). + * + * hugetlb_vmemmap_init() will take care of initialization after + * memmap_init(). 
+ */ + + p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; + + tail = virt_to_page(p); + zone->vmemmap_tails[idx] = tail; + + return tail; +} + int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, - int node, unsigned long headsize) + unsigned int order, struct zone *zone, + unsigned long headsize) { - pte_t *pte; unsigned long maddr; + struct page *tail; + pte_t *pte; + int node = zone_to_nid(zone); + + tail = vmemmap_get_tail(order, zone); + if (!tail) + return -ENOMEM; for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) { pte = vmemmap_populate_address(maddr, node, NULL, -1, 0); @@ -399,8 +384,9 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, * Reuse the last page struct page mapped above for the rest. */ return vmemmap_populate_range(maddr, end, node, NULL, - pte_pfn(ptep_get(pte)), 0); + page_to_pfn(tail), 0); } +#endif void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next) @@ -605,3 +591,307 @@ void __init sparse_vmemmap_init_nid_late(int nid) hugetlb_vmemmap_init_late(nid); } #endif + +static void subsection_mask_set(unsigned long *map, unsigned long pfn, + unsigned long nr_pages) +{ + int idx = subsection_map_index(pfn); + int end = subsection_map_index(pfn + nr_pages - 1); + + bitmap_set(map, idx, end - idx + 1); +} + +void __init sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1); + unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn); + + for (nr = start_sec_nr; nr <= end_sec_nr; nr++) { + struct mem_section *ms; + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + ms = __nr_to_section(nr); + subsection_mask_set(ms->usage->subsection_map, pfn, pfns); + + pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, + pfns, subsection_map_index(pfn), + subsection_map_index(pfn + pfns - 1)); + 
+ pfn += pfns; + nr_pages -= pfns; + } +} + +#ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms = __nr_to_section(section_nr); + + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + +/* Mark all memory sections within the pfn range as offline */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms = __nr_to_section(section_nr); + + ms->section_mem_map &= ~SECTION_IS_ONLINE; + } +} + +static struct page * __meminit populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); +} + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); + + vmemmap_free(start, end, altmap); +} +static void free_map_bootmem(struct page *memmap) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + + vmemmap_free(start, end, NULL); +} + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + unsigned long *subsection_map = ms->usage + ? 
&ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return -EINVAL; + + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return bitmap_empty(&ms->usage->subsection_map[0], + SUBSECTIONS_PER_SECTION); +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + struct mem_section *ms = __pfn_to_section(pfn); + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + unsigned long *subsection_map; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + return rc; +} + +/* + * To deactivate a memory region, there are 3 cases to handle: + * + * 1. deactivation of a partial hot-added section: + * a) section was present at memory init. + * b) section was hot-added post memory init. + * 2. deactivation of a complete hot-added section. + * 3. deactivation of a complete section from memory init. + * + * For 1, when subsection_map does not empty we will not be freeing the + * usage map, but still need to free the vmemmap range. 
+ */ +static void section_deactivate(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + bool section_is_early = early_section(ms); + struct page *memmap = NULL; + bool empty; + + if (clear_subsection_map(pfn, nr_pages)) + return; + + empty = is_subsection_map_empty(ms); + if (empty) { + /* + * Mark the section invalid so that valid_section() + * return false. This prevents code from dereferencing + * ms->usage array. + */ + ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; + + /* + * When removing an early section, the usage map is kept (as the + * usage maps of other sections fall into the same page). It + * will be re-used when re-adding the section - which is then no + * longer an early section. If the usage map is PageReserved, it + * was allocated during boot. + */ + if (!PageReserved(virt_to_page(ms->usage))) { + kfree_rcu(ms->usage, rcu); + WRITE_ONCE(ms->usage, NULL); + } + memmap = pfn_to_page(SECTION_ALIGN_DOWN(pfn)); + } + + /* + * The memmap of early sections is always fully populated. See + * section_activate() and pfn_valid() . 
+ */ + if (!section_is_early) { + memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); + depopulate_section_memmap(pfn, nr_pages, altmap); + } else if (memmap) { + memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), + PAGE_SIZE))); + free_map_bootmem(memmap); + } + + if (empty) + ms->section_mem_map = (unsigned long)NULL; +} + +static struct page * __meminit section_activate(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + struct mem_section_usage *usage = NULL; + struct page *memmap; + int rc; + + if (!ms->usage) { + usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); + if (!usage) + return ERR_PTR(-ENOMEM); + ms->usage = usage; + } + + rc = fill_subsection_map(pfn, nr_pages); + if (rc) { + if (usage) + ms->usage = NULL; + kfree(usage); + return ERR_PTR(rc); + } + + /* + * The early init code does not consider partially populated + * initial sections, it simply assumes that memory will never be + * referenced. If we hot-add memory into such a section then we + * do not need to populate the memmap and can simply reuse what + * is already there. + */ + if (nr_pages < PAGES_PER_SECTION && early_section(ms)) + return pfn_to_page(pfn); + + memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); + if (!memmap) { + section_deactivate(pfn, nr_pages, altmap); + return ERR_PTR(-ENOMEM); + } + memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); + + return memmap; +} + +/** + * sparse_add_section - add a memory section, or populate an existing one + * @nid: The node to add section on + * @start_pfn: start pfn of the memory range + * @nr_pages: number of pfns to add in the section + * @altmap: alternate pfns to allocate the memmap backing store + * @pgmap: alternate compound page geometry for devmap mappings + * + * This is only intended for hotplug. 
+ * + * Note that only VMEMMAP supports sub-section aligned hotplug, + * the proper alignment and size are gated by check_pfn_span(). + * + * + * Return: + * * 0 - On success. + * * -EEXIST - Section has been present. + * * -ENOMEM - Out of memory. + */ +int __meminit sparse_add_section(int nid, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + struct page *memmap; + int ret; + + ret = sparse_index_init(section_nr, nid); + if (ret < 0) + return ret; + + memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap); + if (IS_ERR(memmap)) + return PTR_ERR(memmap); + + /* + * Poison uninitialized struct pages in order to catch invalid flags + * combinations. + */ + page_init_poison(memmap, sizeof(struct page) * nr_pages); + + ms = __nr_to_section(section_nr); + __section_mark_present(ms, section_nr); + + /* Align memmap to section boundary in the subsection case */ + if (section_nr_to_pfn(section_nr) != start_pfn) + memmap = pfn_to_page(section_nr_to_pfn(section_nr)); + sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); + + return 0; +} + +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + + if (WARN_ON_ONCE(!valid_section(ms))) + return; + + section_deactivate(pfn, nr_pages, altmap); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/sparse.c b/mm/sparse.c index b5b2b6f7041b..effdac6b0ab1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -79,7 +79,7 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) return section; } -static int __meminit sparse_index_init(unsigned long section_nr, int nid) +int __meminit sparse_index_init(unsigned long section_nr, int nid) { unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; @@ -103,7 +103,7 @@ static int __meminit 
sparse_index_init(unsigned long section_nr, int nid) return 0; } #else /* !SPARSEMEM_EXTREME */ -static inline int sparse_index_init(unsigned long section_nr, int nid) +int sparse_index_init(unsigned long section_nr, int nid) { return 0; } @@ -161,58 +161,12 @@ static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, * those loops early. */ unsigned long __highest_present_section_nr; -static void __section_mark_present(struct mem_section *ms, - unsigned long section_nr) -{ - if (section_nr > __highest_present_section_nr) - __highest_present_section_nr = section_nr; - - ms->section_mem_map |= SECTION_MARKED_PRESENT; -} static inline unsigned long first_present_section_nr(void) { return next_present_section_nr(-1); } -#ifdef CONFIG_SPARSEMEM_VMEMMAP -static void subsection_mask_set(unsigned long *map, unsigned long pfn, - unsigned long nr_pages) -{ - int idx = subsection_map_index(pfn); - int end = subsection_map_index(pfn + nr_pages - 1); - - bitmap_set(map, idx, end - idx + 1); -} - -void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) -{ - int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1); - unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn); - - for (nr = start_sec_nr; nr <= end_sec_nr; nr++) { - struct mem_section *ms; - unsigned long pfns; - - pfns = min(nr_pages, PAGES_PER_SECTION - - (pfn & ~PAGE_SECTION_MASK)); - ms = __nr_to_section(nr); - subsection_mask_set(ms->usage->subsection_map, pfn, pfns); - - pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, - pfns, subsection_map_index(pfn), - subsection_map_index(pfn + pfns - 1)); - - pfn += pfns; - nr_pages -= pfns; - } -} -#else -void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) -{ -} -#endif - /* Record a memory area against a node. 
*/ static void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -260,42 +214,6 @@ static void __init memblocks_present(void) memory_present(nid, start, end); } -/* - * Subtle, we encode the real pfn into the mem_map such that - * the identity pfn - section_mem_map will return the actual - * physical page frame number. - */ -static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) -{ - unsigned long coded_mem_map = - (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); - BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); - BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); - return coded_mem_map; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * Decode mem_map from the coded memmap - */ -struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) -{ - /* mask off the extra low bits of information */ - coded_mem_map &= SECTION_MAP_MASK; - return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - -static void __meminit sparse_init_one_section(struct mem_section *ms, - unsigned long pnum, struct page *mem_map, - struct mem_section_usage *usage, unsigned long flags) -{ - ms->section_mem_map &= ~SECTION_MAP_MASK; - ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) - | SECTION_HAS_MEM_MAP | flags; - ms->usage = usage; -} - static unsigned long usemap_size(void) { return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); @@ -306,102 +224,6 @@ size_t mem_section_usage_size(void) return sizeof(struct mem_section_usage) + usemap_size(); } -#ifdef CONFIG_MEMORY_HOTREMOVE -static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) -{ -#ifndef CONFIG_NUMA - VM_BUG_ON(pgdat != &contig_page_data); - return __pa_symbol(&contig_page_data); -#else - return __pa(pgdat); -#endif -} - -static struct mem_section_usage * __init -sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long size) -{ - struct 
mem_section_usage *usage; - unsigned long goal, limit; - int nid; - /* - * A page may contain usemaps for other sections preventing the - * page being freed and making a section unremovable while - * other sections referencing the usemap remain active. Similarly, - * a pgdat can prevent a section being removed. If section A - * contains a pgdat and section B contains the usemap, both - * sections become inter-dependent. This allocates usemaps - * from the same section as the pgdat where possible to avoid - * this problem. - */ - goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); - limit = goal + (1UL << PA_SECTION_SHIFT); - nid = early_pfn_to_nid(goal >> PAGE_SHIFT); -again: - usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); - if (!usage && limit) { - limit = MEMBLOCK_ALLOC_ACCESSIBLE; - goto again; - } - return usage; -} - -static void __init check_usemap_section_nr(int nid, - struct mem_section_usage *usage) -{ - unsigned long usemap_snr, pgdat_snr; - static unsigned long old_usemap_snr; - static unsigned long old_pgdat_snr; - struct pglist_data *pgdat = NODE_DATA(nid); - int usemap_nid; - - /* First call */ - if (!old_usemap_snr) { - old_usemap_snr = NR_MEM_SECTIONS; - old_pgdat_snr = NR_MEM_SECTIONS; - } - - usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT); - pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT); - if (usemap_snr == pgdat_snr) - return; - - if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) - /* skip redundant message */ - return; - - old_usemap_snr = usemap_snr; - old_pgdat_snr = pgdat_snr; - - usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); - if (usemap_nid != nid) { - pr_info("node %d must be removed before remove section %ld\n", - nid, usemap_snr); - return; - } - /* - * There is a circular dependency. - * Some platforms allow un-removable section because they will just - * gather other removable sections for dynamic partitioning. 
- * Just notify un-removable section's number here. - */ - pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n", - usemap_snr, pgdat_snr, nid); -} -#else -static struct mem_section_usage * __init -sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long size) -{ - return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id); -} - -static void __init check_usemap_section_nr(int nid, - struct mem_section_usage *usage) -{ -} -#endif /* CONFIG_MEMORY_HOTREMOVE */ - #ifdef CONFIG_SPARSEMEM_VMEMMAP unsigned long __init section_map_size(void) { @@ -498,7 +320,6 @@ void __init sparse_init_early_section(int nid, struct page *map, unsigned long pnum, unsigned long flags) { BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end); - check_usemap_section_nr(nid, sparse_usagebuf); sparse_init_one_section(__nr_to_section(pnum), pnum, map, sparse_usagebuf, SECTION_IS_EARLY | flags); sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size(); @@ -509,8 +330,7 @@ static int __init sparse_usage_init(int nid, unsigned long map_count) unsigned long size; size = mem_section_usage_size() * map_count; - sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section( - NODE_DATA(nid), size); + sparse_usagebuf = memblock_alloc_node(size, SMP_CACHE_BYTES, nid); if (!sparse_usagebuf) { sparse_usagebuf_end = NULL; return -ENOMEM; @@ -583,7 +403,6 @@ failed: ms = __nr_to_section(pnum); if (!preinited_vmemmap_section(ms)) ms->section_mem_map = 0; - ms->section_mem_map = 0; } } @@ -600,6 +419,11 @@ void __init sparse_init(void) BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); memblocks_present(); + if (compound_info_has_mask()) { + VM_WARN_ON_ONCE(!IS_ALIGNED((unsigned long) pfn_to_page(0), + MAX_FOLIO_VMEMMAP_ALIGN)); + } + pnum_begin = first_present_section_nr(); nid_begin = sparse_early_nid(__nr_to_section(pnum_begin)); @@ -623,356 +447,3 @@ void __init sparse_init(void) 
sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); vmemmap_populate_print_last(); } - -#ifdef CONFIG_MEMORY_HOTPLUG - -/* Mark all memory sections within the pfn range as online */ -void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *ms; - - /* onlining code should never touch invalid ranges */ - if (WARN_ON(!valid_section_nr(section_nr))) - continue; - - ms = __nr_to_section(section_nr); - ms->section_mem_map |= SECTION_IS_ONLINE; - } -} - -/* Mark all memory sections within the pfn range as offline */ -void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *ms; - - /* - * TODO this needs some double checking. 
Offlining code makes - * sure to check pfn_valid but those checks might be just bogus - */ - if (WARN_ON(!valid_section_nr(section_nr))) - continue; - - ms = __nr_to_section(section_nr); - ms->section_mem_map &= ~SECTION_IS_ONLINE; - } -} - -#ifdef CONFIG_SPARSEMEM_VMEMMAP -static struct page * __meminit populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); -} - -static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - unsigned long start = (unsigned long) pfn_to_page(pfn); - unsigned long end = start + nr_pages * sizeof(struct page); - - vmemmap_free(start, end, altmap); -} -static void free_map_bootmem(struct page *memmap) -{ - unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); - - vmemmap_free(start, end, NULL); -} - -static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; - DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; - struct mem_section *ms = __pfn_to_section(pfn); - unsigned long *subsection_map = ms->usage - ? 
&ms->usage->subsection_map[0] : NULL; - - subsection_mask_set(map, pfn, nr_pages); - if (subsection_map) - bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); - - if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), - "section already deactivated (%#lx + %ld)\n", - pfn, nr_pages)) - return -EINVAL; - - bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); - return 0; -} - -static bool is_subsection_map_empty(struct mem_section *ms) -{ - return bitmap_empty(&ms->usage->subsection_map[0], - SUBSECTIONS_PER_SECTION); -} - -static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - struct mem_section *ms = __pfn_to_section(pfn); - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; - unsigned long *subsection_map; - int rc = 0; - - subsection_mask_set(map, pfn, nr_pages); - - subsection_map = &ms->usage->subsection_map[0]; - - if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) - rc = -EINVAL; - else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) - rc = -EEXIST; - else - bitmap_or(subsection_map, map, subsection_map, - SUBSECTIONS_PER_SECTION); - - return rc; -} -#else -static struct page * __meminit populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - return kvmalloc_node(array_size(sizeof(struct page), - PAGES_PER_SECTION), GFP_KERNEL, nid); -} - -static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - kvfree(pfn_to_page(pfn)); -} - -static void free_map_bootmem(struct page *memmap) -{ - unsigned long maps_section_nr, removing_section_nr, i; - unsigned long type, nr_pages; - struct page *page = virt_to_page(memmap); - - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) - >> PAGE_SHIFT; - - for (i = 0; i < nr_pages; i++, page++) { - type = bootmem_type(page); - - BUG_ON(type == NODE_INFO); - - maps_section_nr = 
pfn_to_section_nr(page_to_pfn(page)); - removing_section_nr = bootmem_info(page); - - /* - * When this function is called, the removing section is - * logical offlined state. This means all pages are isolated - * from page allocator. If removing section's memmap is placed - * on the same section, it must not be freed. - * If it is freed, page allocator may allocate it which will - * be removed physically soon. - */ - if (maps_section_nr != removing_section_nr) - put_page_bootmem(page); - } -} - -static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - return 0; -} - -static bool is_subsection_map_empty(struct mem_section *ms) -{ - return true; -} - -static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - return 0; -} -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - -/* - * To deactivate a memory region, there are 3 cases to handle across - * two configurations (SPARSEMEM_VMEMMAP={y,n}): - * - * 1. deactivation of a partial hot-added section (only possible in - * the SPARSEMEM_VMEMMAP=y case). - * a) section was present at memory init. - * b) section was hot-added post memory init. - * 2. deactivation of a complete hot-added section. - * 3. deactivation of a complete section from memory init. - * - * For 1, when subsection_map does not empty we will not be freeing the - * usage map, but still need to free the vmemmap range. - * - * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified - */ -static void section_deactivate(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - bool section_is_early = early_section(ms); - struct page *memmap = NULL; - bool empty; - - if (clear_subsection_map(pfn, nr_pages)) - return; - - empty = is_subsection_map_empty(ms); - if (empty) { - unsigned long section_nr = pfn_to_section_nr(pfn); - - /* - * Mark the section invalid so that valid_section() - * return false. 
This prevents code from dereferencing - * ms->usage array. - */ - ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; - - /* - * When removing an early section, the usage map is kept (as the - * usage maps of other sections fall into the same page). It - * will be re-used when re-adding the section - which is then no - * longer an early section. If the usage map is PageReserved, it - * was allocated during boot. - */ - if (!PageReserved(virt_to_page(ms->usage))) { - kfree_rcu(ms->usage, rcu); - WRITE_ONCE(ms->usage, NULL); - } - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - } - - /* - * The memmap of early sections is always fully populated. See - * section_activate() and pfn_valid() . - */ - if (!section_is_early) { - memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); - depopulate_section_memmap(pfn, nr_pages, altmap); - } else if (memmap) { - memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), - PAGE_SIZE))); - free_map_bootmem(memmap); - } - - if (empty) - ms->section_mem_map = (unsigned long)NULL; -} - -static struct page * __meminit section_activate(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - struct mem_section_usage *usage = NULL; - struct page *memmap; - int rc; - - if (!ms->usage) { - usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); - if (!usage) - return ERR_PTR(-ENOMEM); - ms->usage = usage; - } - - rc = fill_subsection_map(pfn, nr_pages); - if (rc) { - if (usage) - ms->usage = NULL; - kfree(usage); - return ERR_PTR(rc); - } - - /* - * The early init code does not consider partially populated - * initial sections, it simply assumes that memory will never be - * referenced. If we hot-add memory into such a section then we - * do not need to populate the memmap and can simply reuse what - * is already there. 
- */ - if (nr_pages < PAGES_PER_SECTION && early_section(ms)) - return pfn_to_page(pfn); - - memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); - if (!memmap) { - section_deactivate(pfn, nr_pages, altmap); - return ERR_PTR(-ENOMEM); - } - memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); - - return memmap; -} - -/** - * sparse_add_section - add a memory section, or populate an existing one - * @nid: The node to add section on - * @start_pfn: start pfn of the memory range - * @nr_pages: number of pfns to add in the section - * @altmap: alternate pfns to allocate the memmap backing store - * @pgmap: alternate compound page geometry for devmap mappings - * - * This is only intended for hotplug. - * - * Note that only VMEMMAP supports sub-section aligned hotplug, - * the proper alignment and size are gated by check_pfn_span(). - * - * - * Return: - * * 0 - On success. - * * -EEXIST - Section has been present. - * * -ENOMEM - Out of memory. - */ -int __meminit sparse_add_section(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct mem_section *ms; - struct page *memmap; - int ret; - - ret = sparse_index_init(section_nr, nid); - if (ret < 0) - return ret; - - memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap); - if (IS_ERR(memmap)) - return PTR_ERR(memmap); - - /* - * Poison uninitialized struct pages in order to catch invalid flags - * combinations. 
- */ - page_init_poison(memmap, sizeof(struct page) * nr_pages); - - ms = __nr_to_section(section_nr); - set_section_nid(section_nr, nid); - __section_mark_present(ms, section_nr); - - /* Align memmap to section boundary in the subsection case */ - if (section_nr_to_pfn(section_nr) != start_pfn) - memmap = pfn_to_page(section_nr_to_pfn(section_nr)); - sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); - - return 0; -} - -void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - - if (WARN_ON_ONCE(!valid_section(ms))) - return; - - section_deactivate(pfn, nr_pages, altmap); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/swap.c b/mm/swap.c index bb19ccbece46..5cc44f0de987 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -20,7 +20,7 @@ #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> @@ -91,7 +91,7 @@ static void page_cache_release(struct folio *folio) __page_cache_release(folio, &lruvec, &flags); if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) @@ -175,7 +175,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); folios_put(fbatch); } @@ -240,6 +240,7 @@ void folio_rotate_reclaimable(struct folio *folio) void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock) + __releases(rcu) { unsigned long cost; @@ -253,6 +254,7 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; if (!cost) { spin_unlock_irq(&lruvec->lru_lock); + 
rcu_read_unlock(); return; } @@ -285,8 +287,10 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, spin_unlock_irq(&lruvec->lru_lock); lruvec = parent_lruvec(lruvec); - if (!lruvec) + if (!lruvec) { + rcu_read_unlock(); break; + } spin_lock_irq(&lruvec->lru_lock); } } @@ -349,7 +353,7 @@ void folio_activate(struct folio *folio) lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); folio_set_lru(folio); } #endif @@ -412,18 +416,20 @@ static void lru_gen_inc_refs(struct folio *folio) static bool lru_gen_clear_refs(struct folio *folio) { - struct lru_gen_folio *lrugen; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); + unsigned long seq; if (gen < 0) return true; set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); - lrugen = &folio_lruvec(folio)->lrugen; + rcu_read_lock(); + seq = READ_ONCE(folio_lruvec(folio)->lrugen.min_seq[type]); + rcu_read_unlock(); /* whether can do without shuffling under the LRU lock */ - return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); + return gen == lru_gen_from_seq(seq); } #else /* !CONFIG_LRU_GEN */ @@ -963,7 +969,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (folio_is_zone_device(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } if (folio_ref_sub_and_test(folio, nr_refs)) @@ -977,7 +983,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); @@ -991,7 +997,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) j++; } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); if (!j) { 
folio_batch_reinit(folios); return; @@ -1018,7 +1024,7 @@ EXPORT_SYMBOL(folios_put_refs); void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; - int refs[PAGEVEC_SIZE]; + int refs[FOLIO_BATCH_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; @@ -1084,6 +1090,39 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } +#ifdef CONFIG_MEMCG +static void lruvec_reparent_lru(struct lruvec *child_lruvec, + struct lruvec *parent_lruvec, + enum lru_list lru, int nid) +{ + int zid; + struct zone *zone; + + if (lru != LRU_UNEVICTABLE) + list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]); + + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { + unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); + + mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); + } +} + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + enum lru_list lru; + struct lruvec *child_lruvec, *parent_lruvec; + + child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid)); + parent_lruvec->anon_cost += child_lruvec->anon_cost; + parent_lruvec->file_cost += child_lruvec->file_cost; + + for_each_lru(lru) + lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid); +} +#endif + static const struct ctl_table swap_sysctl_table[] = { { .procname = "page-cluster", diff --git a/mm/swap.h b/mm/swap.h index bfafa637c458..a77016f2423b 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -37,6 +37,7 @@ struct swap_cluster_info { u8 flags; u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ + unsigned int *extend_table; /* For large swap count, protected by ci->lock */ struct list_head list; }; @@ -84,7 +85,7 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { 
VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ - VM_WARN_ON_ONCE(offset >= si->max); + VM_WARN_ON_ONCE(offset >= roundup(si->max, SWAPFILE_CLUSTER)); return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } @@ -183,6 +184,8 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); + /* * Below are the core routines for doing swap for a folio. * All helpers requires the folio to be locked, and a locked folio @@ -192,12 +195,13 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) * * folio_alloc_swap(): the entry point for a folio to be swapped * out. It allocates swap slots and pins the slots with swap cache. - * The slots start with a swap count of zero. + * The slots start with a swap count of zero. The slots are pinned + * by swap cache reference which doesn't contribute to swap count. * * folio_dup_swap(): increases the swap count of a folio, usually * during it gets unmapped and a swap entry is installed to replace * it (e.g., swap entry in page table). A swap slot with swap - * count == 0 should only be increasd by this helper. + * count == 0 can only be increased by this helper. * * folio_put_swap(): does the opposite thing of folio_dup_swap(). 
*/ @@ -206,9 +210,9 @@ int folio_dup_swap(struct folio *folio, struct page *subpage); void folio_put_swap(struct folio *folio, struct page *subpage); /* For internal use */ -extern void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages); +extern void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr_pages); /* linux/mm/page_io.c */ int sio_pool_init(void); @@ -286,7 +290,6 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow); void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new); -void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents); void show_swap_cache_info(void); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); @@ -446,6 +449,11 @@ static inline int swap_writeout(struct folio *folio, return 0; } +static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + return -EINVAL; +} + static inline bool swap_cache_has_folio(swp_entry_t entry) { return false; diff --git a/mm/swap_state.c b/mm/swap_state.c index 6d0eef7470be..1415a5c54a43 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,7 +15,7 @@ #include <linux/leafops.h> #include <linux/init.h> #include <linux/pagemap.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/migrate.h> @@ -140,21 +140,20 @@ void *swap_cache_get_shadow(swp_entry_t entry) void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry) { - unsigned long new_tb; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_off = swp_cluster_offset(entry), ci_end; unsigned long nr_pages = folio_nr_pages(folio); + unsigned long pfn = folio_pfn(folio); + unsigned long old_tb; 
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - new_tb = folio_to_swp_tb(folio); - ci_start = swp_cluster_offset(entry); - ci_off = ci_start; - ci_end = ci_start + nr_pages; + ci_end = ci_off + nr_pages; do { - VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); - __swap_table_set(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); folio_ref_add(folio, nr_pages); @@ -183,14 +182,13 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, unsigned long old_tb; struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end, offset; + unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); si = __swap_entry_to_info(entry); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - offset = swp_offset(entry); ci = swap_cluster_lock(si, swp_offset(entry)); if (unlikely(!ci->table)) { err = -ENOENT; @@ -202,13 +200,12 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, err = -EEXIST; goto failed; } - if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + if (unlikely(!__swp_tb_get_count(old_tb))) { err = -ENOENT; goto failed; } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); - offset++; } while (++ci_off < ci_end); __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); @@ -237,8 +234,9 @@ failed: void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + int count; + unsigned long old_tb; struct swap_info_struct *si; - unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; bool folio_swapped = false, need_free = false; 
unsigned long nr_pages = folio_nr_pages(folio); @@ -249,20 +247,20 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); si = __swap_entry_to_info(entry); - new_tb = shadow_swp_to_tb(shadow); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; do { - /* If shadow is NULL, we sets an empty shadow */ - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); - if (__swap_count(swp_entry(si->type, - swp_offset(entry) + ci_off - ci_start))) + count = __swp_tb_get_count(old_tb); + if (count) folio_swapped = true; else need_free = true; + /* If shadow is NULL, we sets an empty shadow. */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); } while (++ci_off < ci_end); folio->swap.val = 0; @@ -271,13 +269,13 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); if (!folio_swapped) { - swap_entries_free(si, ci, swp_offset(entry), nr_pages); + __swap_cluster_free_entries(si, ci, ci_start, nr_pages); } else if (need_free) { + ci_off = ci_start; do { - if (!__swap_count(entry)) - swap_entries_free(si, ci, swp_offset(entry), 1); - entry.val++; - } while (--nr_pages); + if (!__swp_tb_get_count(__swap_table_get(ci, ci_off))) + __swap_cluster_free_entries(si, ci, ci_off, 1); + } while (++ci_off < ci_end); } } @@ -324,17 +322,18 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, unsigned long nr_pages = folio_nr_pages(new); unsigned int ci_off = swp_cluster_offset(entry); unsigned int ci_end = ci_off + nr_pages; - unsigned long old_tb, new_tb; + unsigned long pfn = folio_pfn(new); + unsigned long old_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); 
VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ - new_tb = folio_to_swp_tb(new); do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); /* @@ -351,27 +350,6 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, } } -/** - * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache. - * @entry: The starting index entry. - * @nr_ents: How many slots need to be cleared. - * - * Context: Caller must ensure the range is valid, all in one single cluster, - * not occupied by any folio, and lock the cluster. - */ -void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) -{ - struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); - unsigned int ci_off = swp_cluster_offset(entry), ci_end; - unsigned long old; - - ci_end = ci_off + nr_ents; - do { - old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); - WARN_ON_ONCE(swp_tb_is_folio(old)); - } while (++ci_off < ci_end); -} - /* * If we are the only user, then try to free up the swap cache. 
* @@ -407,7 +385,7 @@ void free_folio_and_swap_cache(struct folio *folio) void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { struct folio_batch folios; - unsigned int refs[PAGEVEC_SIZE]; + unsigned int refs[FOLIO_BATCH_SIZE]; folio_batch_init(&folios); for (int i = 0; i < nr; i++) { @@ -494,6 +472,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, __folio_set_locked(folio); __folio_set_swapbacked(folio); + + if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) + goto failed; + for (;;) { ret = swap_cache_add_folio(folio, entry, &shadow); if (!ret) @@ -514,11 +496,6 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, goto failed; } - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { - swap_cache_del_folio(folio); - goto failed; - } - memcg1_swapin(entry, folio_nr_pages(folio)); if (shadow) workingset_refault(folio, shadow); diff --git a/mm/swap_table.h b/mm/swap_table.h index ea244a57a5b7..8415ffbe2b9c 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -18,10 +18,69 @@ struct swap_table { * (physical or virtual) device. The swap table in each cluster is a * 1:1 map of the swap slots in this cluster. * - * Each swap table entry could be a pointer (folio), a XA_VALUE - * (shadow), or NULL. + * Swap table entry type and bits layouts: + * + * NULL: |---------------- 0 ---------------| - Free slot + * Shadow: | SWAP_COUNT |---- SHADOW_VAL ---|1| - Swapped out slot + * PFN: | SWAP_COUNT |------ PFN -------|10| - Cached slot + * Pointer: |----------- Pointer ----------|100| - (Unused) + * Bad: |------------- 1 -------------|1000| - Bad slot + * + * SWAP_COUNT is `SWP_TB_COUNT_BITS` long, each entry is an atomic long. + * + * Usages: + * + * - NULL: Swap slot is unused, could be allocated. + * + * - Shadow: Swap slot is used and not cached (usually swapped out). It reuses + * the XA_VALUE format to be compatible with working set shadows. 
SHADOW_VAL + * part might be all 0 if the working shadow info is absent. In such a case, + * we still want to keep the shadow format as a placeholder. + * + * Memcg ID is embedded in SHADOW_VAL. + * + * - PFN: Swap slot is in use, and cached. Memcg info is recorded on the page + * struct. + * + * - Pointer: Unused yet. `0b100` is reserved for potential pointer usage + * because only the lower three bits can be used as a marker for 8 bytes + * aligned pointers. + * + * - Bad: Swap slot is reserved, protects swap header or holes on swap devices. */ +#if defined(MAX_POSSIBLE_PHYSMEM_BITS) +#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) +#elif defined(MAX_PHYSMEM_BITS) +#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#else +#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) +#endif + +/* NULL Entry, all 0 */ +#define SWP_TB_NULL 0UL + +/* Swapped out: shadow */ +#define SWP_TB_SHADOW_MARK 0b1UL + +/* Cached: PFN */ +#define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWP_TB_PFN_MARK_BITS) +#define SWP_TB_PFN_MARK 0b10UL +#define SWP_TB_PFN_MARK_BITS 2 +#define SWP_TB_PFN_MARK_MASK (BIT(SWP_TB_PFN_MARK_BITS) - 1) + +/* SWAP_COUNT part for PFN or shadow, the width can be shrunk or extended */ +#define SWP_TB_COUNT_BITS min(4, BITS_PER_LONG - SWP_TB_PFN_BITS) +#define SWP_TB_COUNT_MASK (~((~0UL) >> SWP_TB_COUNT_BITS)) +#define SWP_TB_COUNT_SHIFT (BITS_PER_LONG - SWP_TB_COUNT_BITS) +#define SWP_TB_COUNT_MAX ((1 << SWP_TB_COUNT_BITS) - 1) + +/* Bad slot: ends with 0b1000 and rests of bits are all 1 */ +#define SWP_TB_BAD ((~0UL) << 3) + +/* Macro for shadow offset calculation */ +#define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS + /* * Helpers for casting one type of info into a swap table entry. 
*/ @@ -31,18 +90,47 @@ static inline unsigned long null_to_swp_tb(void) return 0; } -static inline unsigned long folio_to_swp_tb(struct folio *folio) +static inline unsigned long __count_to_swp_tb(unsigned char count) { + /* + * At least three values are needed to distinguish free (0), + * used (count > 0 && count < SWP_TB_COUNT_MAX), and + * overflow (count == SWP_TB_COUNT_MAX). + */ + BUILD_BUG_ON(SWP_TB_COUNT_MAX < 2 || SWP_TB_COUNT_BITS < 2); + VM_WARN_ON(count > SWP_TB_COUNT_MAX); + return ((unsigned long)count) << SWP_TB_COUNT_SHIFT; +} + +static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned int count) +{ + unsigned long swp_tb; + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); - return (unsigned long)folio; + BUILD_BUG_ON(SWAP_CACHE_PFN_BITS > + (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_COUNT_BITS)); + + swp_tb = (pfn << SWP_TB_PFN_MARK_BITS) | SWP_TB_PFN_MARK; + VM_WARN_ON_ONCE(swp_tb & SWP_TB_COUNT_MASK); + + return swp_tb | __count_to_swp_tb(count); +} + +static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned int count) +{ + return pfn_to_swp_tb(folio_pfn(folio), count); } -static inline unsigned long shadow_swp_to_tb(void *shadow) +static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned int count) { BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != BITS_PER_BYTE * sizeof(unsigned long)); + BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK); + VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); - return (unsigned long)shadow; + VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_COUNT_MASK)); + + return (unsigned long)shadow | __count_to_swp_tb(count) | SWP_TB_SHADOW_MARK; } /* @@ -55,7 +143,7 @@ static inline bool swp_tb_is_null(unsigned long swp_tb) static inline bool swp_tb_is_folio(unsigned long swp_tb) { - return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); + return ((swp_tb & SWP_TB_PFN_MARK_MASK) == SWP_TB_PFN_MARK); } static inline bool swp_tb_is_shadow(unsigned long 
swp_tb) @@ -63,19 +151,49 @@ static inline bool swp_tb_is_shadow(unsigned long swp_tb) return xa_is_value((void *)swp_tb); } +static inline bool swp_tb_is_bad(unsigned long swp_tb) +{ + return swp_tb == SWP_TB_BAD; +} + +static inline bool swp_tb_is_countable(unsigned long swp_tb) +{ + return (swp_tb_is_shadow(swp_tb) || swp_tb_is_folio(swp_tb) || + swp_tb_is_null(swp_tb)); +} + /* * Helpers for retrieving info from swap table. */ static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_folio(swp_tb)); - return (void *)swp_tb; + return pfn_folio((swp_tb & ~SWP_TB_COUNT_MASK) >> SWP_TB_PFN_MARK_BITS); } static inline void *swp_tb_to_shadow(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); - return (void *)swp_tb; + /* No shift needed, xa_value is stored as it is in the lower bits. */ + return (void *)(swp_tb & ~SWP_TB_COUNT_MASK); +} + +static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT); +} + +static inline int swp_tb_get_count(unsigned long swp_tb) +{ + if (swp_tb_is_countable(swp_tb)) + return __swp_tb_get_count(swp_tb); + return -EINVAL; +} + +static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count) +{ + return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count)); } /* @@ -120,6 +238,8 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci, atomic_long_t *table; unsigned long swp_tb; + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + rcu_read_lock(); table = rcu_dereference(ci->table); swp_tb = table ? 
atomic_long_read(&table[off]) : null_to_swp_tb(); diff --git a/mm/swapfile.c b/mm/swapfile.c index 94af29d1de88..9174f1eeffb0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -48,23 +48,22 @@ #include <linux/swap_cgroup.h> #include "swap_table.h" #include "internal.h" -#include "swap_table.h" #include "swap.h" -static bool swap_count_continued(struct swap_info_struct *, pgoff_t, - unsigned char); -static void free_swap_count_continuations(struct swap_info_struct *); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags); +/* + * Protects the swap_info array, and the SWP_USED flag. swap_info contains + * lazily allocated & freed swap device info struts, and SWP_USED indicates + * which device is used, ~SWP_USED devices and can be reused. + * + * Also protects swap_active_head total_swap_pages, and the SWP_WRITEOK flag. 
+ */ static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; @@ -110,6 +109,7 @@ struct swap_info_struct *swap_info[MAX_SWAPFILES]; static struct kmem_cache *swap_table_cachep; +/* Protects si->swap_file for /proc/swaps usage */ static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); @@ -174,22 +174,19 @@ static long swap_usage_in_pages(struct swap_info_struct *si) /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 -static bool swap_only_has_cache(struct swap_info_struct *si, - struct swap_cluster_info *ci, +static bool swap_only_has_cache(struct swap_cluster_info *ci, unsigned long offset, int nr_pages) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; do { swp_tb = __swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); - if (*map) + if (swp_tb_get_count(swp_tb)) return false; - ++ci_off; - } while (++map < map_end); + } while (++ci_off < ci_end); return true; } @@ -248,7 +245,7 @@ again: * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); - need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); + need_reclaim = swap_only_has_cache(ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -446,16 +443,40 @@ static void swap_table_free(struct swap_table *table) swap_table_free_folio_rcu_cb); } +/* + * Sanity check to ensure nothing leaked, and the specified range is empty. + * One special case is that bad slots can't be freed, so check the number of + * bad slots for swapoff, and non-swapoff path must never free bad slots. 
+ */ +static void swap_cluster_assert_empty(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr, + bool swapoff) +{ + unsigned int ci_end = ci_off + nr; + unsigned long swp_tb; + int bad_slots = 0; + + if (!IS_ENABLED(CONFIG_DEBUG_VM) && !swapoff) + return; + + do { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_bad(swp_tb)) + bad_slots++; + else + WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + } while (++ci_off < ci_end); + + WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0)); + WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); +} + static void swap_cluster_free_table(struct swap_cluster_info *ci) { - unsigned int ci_off; struct swap_table *table; /* Only empty cluster's table is allow to be freed */ lockdep_assert_held(&ci->lock); - VM_WARN_ON_ONCE(!cluster_is_empty(ci)); - for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) - VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off))); table = (void *)rcu_dereference_protected(ci->table, true); rcu_assign_pointer(ci->table, NULL); @@ -476,8 +497,10 @@ swap_cluster_alloc_table(struct swap_info_struct *si, * Only cluster isolation from the allocator does table allocation. * Swap allocator uses percpu clusters and holds the local lock. */ - lockdep_assert_held(&ci->lock); lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); + if (!(si->flags & SWP_SOLIDSTATE)) + lockdep_assert_held(&si->global_cluster_lock); + lockdep_assert_held(&ci->lock); /* The cluster must be free and was just isolated from the free list. 
*/ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); @@ -559,6 +582,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { + swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, false); swap_cluster_free_table(ci); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; @@ -577,6 +601,7 @@ static struct swap_cluster_info *isolate_lock_cluster( struct swap_info_struct *si, struct list_head *list) { struct swap_cluster_info *ci, *found = NULL; + u8 flags = CLUSTER_FLAG_NONE; spin_lock(&si->lock); list_for_each_entry(ci, list, list) { @@ -589,6 +614,7 @@ static struct swap_cluster_info *isolate_lock_cluster( ci->flags != CLUSTER_FLAG_FULL); list_del(&ci->list); + flags = ci->flags; ci->flags = CLUSTER_FLAG_NONE; found = ci; break; @@ -597,6 +623,7 @@ static struct swap_cluster_info *isolate_lock_cluster( if (found && !cluster_table_is_alloced(found)) { /* Only an empty free cluster's swap table can be freed. */ + VM_WARN_ON_ONCE(flags != CLUSTER_FLAG_FREE); VM_WARN_ON_ONCE(list != &si->free_clusters); VM_WARN_ON_ONCE(!cluster_is_empty(found)); return swap_cluster_alloc_table(si, found); @@ -735,12 +762,32 @@ static void relocate_cluster(struct swap_info_struct *si, * slot. The cluster will not be added to the free cluster list, and its * usage counter will be increased by 1. Only used for initialization. 
*/ -static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info, - unsigned long offset) +static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, + struct swap_cluster_info *cluster_info, + unsigned int offset, bool mask) { + unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned long idx = offset / SWAPFILE_CLUSTER; - struct swap_table *table; struct swap_cluster_info *ci; + struct swap_table *table; + int ret = 0; + + /* si->max may got shrunk by swap swap_activate() */ + if (offset >= si->max && !mask) { + pr_debug("Ignoring bad slot %u (max: %u)\n", offset, si->max); + return 0; + } + /* + * Account it, skip header slot: si->pages is initiated as + * si->max - 1. Also skip the masking of last cluster, + * si->pages doesn't include that part. + */ + if (offset && !mask) + si->pages -= 1; + if (!si->pages) { + pr_warn("Empty swap-file\n"); + return -EINVAL; + } ci = cluster_info + idx; if (!ci->table) { @@ -749,13 +796,20 @@ static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info, return -ENOMEM; rcu_assign_pointer(ci->table, table); } - - ci->count++; + spin_lock(&ci->lock); + /* Check for duplicated bad swap slots. 
*/ + if (__swap_table_xchg(ci, ci_off, SWP_TB_BAD) != SWP_TB_NULL) { + pr_warn("Duplicated bad slot offset %d\n", offset); + ret = -EINVAL; + } else { + ci->count++; + } + spin_unlock(&ci->lock); WARN_ON(ci->count > SWAPFILE_CLUSTER); WARN_ON(ci->flags); - return 0; + return ret; } /* @@ -769,18 +823,16 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, { unsigned int nr_pages = 1 << order; unsigned long offset = start, end = start + nr_pages; - unsigned char *map = si->swap_map; unsigned long swp_tb; spin_unlock(&ci->lock); do { - if (READ_ONCE(map[offset])) - break; swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + if (swp_tb_get_count(swp_tb)) + break; + if (swp_tb_is_folio(swp_tb)) if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) break; - } } while (++offset < end); spin_lock(&ci->lock); @@ -804,7 +856,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, */ for (offset = start; offset < end; offset++) { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (map[offset] || !swp_tb_is_null(swp_tb)) + if (!swp_tb_is_null(swp_tb)) return false; } @@ -816,57 +868,35 @@ static bool cluster_scan_range(struct swap_info_struct *si, unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { - unsigned long end = offset + nr_pages; - unsigned char *map = si->swap_map; + unsigned int ci_off = offset % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; - if (cluster_is_empty(ci)) - return true; - do { - if (map[offset]) - return false; - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_null(swp_tb)) + continue; + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; - } else { - /* A entry with no count and no cache must be null */ - VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + continue; } 
- } while (++offset < end); + /* Slot with zero count can only be NULL or folio */ + VM_WARN_ON(!swp_tb_get_count(swp_tb)); + return false; + } while (++ci_off < ci_end); return true; } -/* - * Currently, the swap table is not used for count tracking, just - * do a sanity check here to ensure nothing leaked, so the swap - * table should be empty upon freeing. - */ -static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, - unsigned int start, unsigned int nr) -{ - unsigned int ci_off = start % SWAPFILE_CLUSTER; - unsigned int ci_end = ci_off + nr; - unsigned long swp_tb; - - if (IS_ENABLED(CONFIG_DEBUG_VM)) { - do { - swp_tb = __swap_table_get(ci, ci_off); - VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); - } while (++ci_off < ci_end); - } -} - -static bool cluster_alloc_range(struct swap_info_struct *si, - struct swap_cluster_info *ci, - struct folio *folio, - unsigned int offset) +static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + struct folio *folio, + unsigned int ci_off) { - unsigned long nr_pages; unsigned int order; + unsigned long nr_pages; lockdep_assert_held(&ci->lock); @@ -885,13 +915,15 @@ static bool cluster_alloc_range(struct swap_info_struct *si, if (likely(folio)) { order = folio_order(folio); nr_pages = 1 << order; - __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); + swap_cluster_assert_empty(ci, ci_off, nr_pages, false); + __swap_cache_add_folio(ci, folio, swp_entry(si->type, + ci_off + cluster_offset(si, ci))); } else if (IS_ENABLED(CONFIG_HIBERNATION)) { order = 0; nr_pages = 1; - WARN_ON_ONCE(si->swap_map[offset]); - si->swap_map[offset] = 1; - swap_cluster_assert_table_empty(ci, offset, 1); + swap_cluster_assert_empty(ci, ci_off, 1, false); + /* Sets a fake shadow as placeholder */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); } else { /* Allocation without folio is only possible with hibernation */ WARN_ON_ONCE(1); @@ -917,8 +949,8 @@ static unsigned 
int alloc_swap_scan_cluster(struct swap_info_struct *si, { unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); - unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int order = likely(folio) ? folio_order(folio) : 0; + unsigned long end = start + SWAPFILE_CLUSTER; unsigned int nr_pages = 1 << order; bool need_reclaim, ret, usable; @@ -942,7 +974,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!ret) continue; } - if (!cluster_alloc_range(si, ci, folio, offset)) + if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER)) break; found = offset; offset += nr_pages; @@ -989,7 +1021,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; + unsigned long swp_tb; int nr_reclaim; if (force) @@ -1001,8 +1033,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) to_scan--; while (offset < end) { - if (!READ_ONCE(map[offset]) && - swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); @@ -1259,7 +1291,6 @@ static void swap_range_alloc(struct swap_info_struct *si, static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { - unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; @@ -1284,7 +1315,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries); /* * 
Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1411,40 +1441,127 @@ start_over: return false; } +static int swap_extend_table_alloc(struct swap_info_struct *si, + struct swap_cluster_info *ci, gfp_t gfp) +{ + void *table; + + table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); + if (!table) + return -ENOMEM; + + spin_lock(&ci->lock); + if (!ci->extend_table) + ci->extend_table = table; + else + kfree(table); + spin_unlock(&ci->lock); + return 0; +} + +int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + int ret; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = get_swap_device(entry); + if (!si) + return 0; + + ci = __swap_offset_to_cluster(si, offset); + ret = swap_extend_table_alloc(si, ci, gfp); + + put_swap_device(si); + return ret; +} + +static void swap_extend_table_try_free(struct swap_cluster_info *ci) +{ + unsigned long i; + bool can_free = true; + + if (!ci->extend_table) + return; + + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (ci->extend_table[i]) + can_free = false; + } + + if (can_free) { + kfree(ci->extend_table); + ci->extend_table = NULL; + } +} + +/* Decrease the swap count of one slot, without freeing it */ +static void __swap_cluster_put_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + count = __swp_tb_get_count(swp_tb); + + VM_WARN_ON_ONCE(count <= 0); + VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX); + + if (count == SWP_TB_COUNT_MAX) { + count = ci->extend_table[ci_off]; + /* Overflow starts with SWP_TB_COUNT_MAX */ + VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX); + count--; + if (count == (SWP_TB_COUNT_MAX - 1)) { + ci->extend_table[ci_off] = 0; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); + swap_extend_table_try_free(ci); + } else { + ci->extend_table[ci_off] = count; + } + } else { + 
__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); + } +} + /** - * swap_put_entries_cluster - Decrease the swap count of a set of slots. + * swap_put_entries_cluster - Decrease the swap count of slots within one cluster * @si: The swap device. - * @start: start offset of slots. + * @offset: start offset of slots. * @nr: number of slots. - * @reclaim_cache: if true, also reclaim the swap cache. + * @reclaim_cache: if true, also reclaim the swap cache if slots are freed. * * This helper decreases the swap count of a set of slots and tries to * batch free them. Also reclaims the swap cache if @reclaim_cache is true. - * Context: The caller must ensure that all slots belong to the same - * cluster and their swap count doesn't go underflow. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. */ static void swap_put_entries_cluster(struct swap_info_struct *si, - unsigned long start, int nr, + pgoff_t offset, int nr, bool reclaim_cache) { - unsigned long offset = start, end = start + nr; - unsigned long batch_start = SWAP_ENTRY_INVALID; struct swap_cluster_info *ci; + unsigned int ci_off, ci_end; + pgoff_t end = offset + nr; bool need_reclaim = false; unsigned int nr_reclaimed; unsigned long swp_tb; - unsigned int count; + int ci_batch = -1; ci = swap_cluster_lock(si, offset); + ci_off = offset % SWAPFILE_CLUSTER; + ci_end = ci_off + nr; do { - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - count = si->swap_map[offset]; - VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); - if (count == 1) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_get_count(swp_tb) == 1) { /* count == 1 and non-cached slots will be batch freed. 
*/ if (!swp_tb_is_folio(swp_tb)) { - if (!batch_start) - batch_start = offset; + if (ci_batch == -1) + ci_batch = ci_off; continue; } /* count will be 0 after put, slot can be reclaimed */ @@ -1456,21 +1573,20 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, * slots will be freed when folio is removed from swap cache * (__swap_cache_del_folio). */ - swap_put_entry_locked(si, ci, offset); - if (batch_start) { - swap_entries_free(si, ci, batch_start, offset - batch_start); - batch_start = SWAP_ENTRY_INVALID; + __swap_cluster_put_entry(ci, ci_off); + if (ci_batch != -1) { + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); + ci_batch = -1; } - } while (++offset < end); + } while (++ci_off < ci_end); - if (batch_start) - swap_entries_free(si, ci, batch_start, offset - batch_start); + if (ci_batch != -1) + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); swap_cluster_unlock(ci); if (!need_reclaim || !reclaim_cache) return; - offset = start; do { nr_reclaimed = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); @@ -1480,6 +1596,92 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, } while (offset < end); } +/* Increase the swap count of one slot. 
*/ +static int __swap_cluster_dup_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + /* Bad or special slots can't be handled */ + if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb))) + return -EINVAL; + count = __swp_tb_get_count(swp_tb); + /* Must be either cached or have a count already */ + if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb))) + return -ENOENT; + + if (likely(count < (SWP_TB_COUNT_MAX - 1))) { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1)); + VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]); + } else if (count == (SWP_TB_COUNT_MAX - 1)) { + if (ci->extend_table) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off]); + ci->extend_table[ci_off] = SWP_TB_COUNT_MAX; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX)); + } else { + return -ENOMEM; + } + } else if (count == SWP_TB_COUNT_MAX) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off] >= + type_max(typeof(ci->extend_table[0]))); + ++ci->extend_table[ci_off]; + } else { + /* Never happens unless counting went wrong */ + WARN_ON_ONCE(1); + } + + return 0; +} + +/** + * swap_dup_entries_cluster: Increase the swap count of slots within one cluster. + * @si: The swap device. + * @offset: start offset of slots. + * @nr: number of slots. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. + * Return: 0 on success. -ENOMEM if the swap count maxed out (SWP_TB_COUNT_MAX) + * and failed to allocate an extended table, -EINVAL if any entry is bad entry. 
+ */ +static int swap_dup_entries_cluster(struct swap_info_struct *si, + pgoff_t offset, int nr) +{ + int err; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + + ci_start = offset % SWAPFILE_CLUSTER; + ci_end = ci_start + nr; + ci_off = ci_start; + ci = swap_cluster_lock(si, offset); +restart: + do { + err = __swap_cluster_dup_entry(ci, ci_off); + if (unlikely(err)) { + if (err == -ENOMEM) { + spin_unlock(&ci->lock); + err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); + spin_lock(&ci->lock); + if (!err) + goto restart; + } + goto failed; + } + } while (++ci_off < ci_end); + swap_cluster_unlock(ci); + return 0; +failed: + while (ci_off-- > ci_start) + __swap_cluster_put_entry(ci, ci_off); + swap_extend_table_try_free(ci); + swap_cluster_unlock(ci); + return err; +} + /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap @@ -1543,18 +1745,19 @@ again: * @subpage: if not NULL, only increase the swap count of this subpage. * * Typically called when the folio is unmapped and have its swap entry to - * take its palce. + * take its place: Swap entries allocated to a folio has count == 0 and pinned + * by swap cache. The swap cache pin doesn't increase the swap count. This + * helper sets the initial count == 1 and increases the count as the folio is + * unmapped and swap entries referencing the slots are generated to replace + * the folio. * * Context: Caller must ensure the folio is locked and in the swap cache. * NOTE: The caller also has to ensure there is no raced call to * swap_put_entries_direct on its swap entry before this helper returns, or - * the swap map may underflow. Currently, we only accept @subpage == NULL - * for shmem due to the limitation of swap continuation: shmem always - * duplicates the swap entry only once, so there is no such issue for it. + * the swap count may underflow. 
*/ int folio_dup_swap(struct folio *folio, struct page *subpage) { - int err = 0; swp_entry_t entry = folio->swap; unsigned long nr_pages = folio_nr_pages(folio); @@ -1566,10 +1769,8 @@ int folio_dup_swap(struct folio *folio, struct page *subpage) nr_pages = 1; } - while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - - return err; + return swap_dup_entries_cluster(swap_entry_to_info(entry), + swp_offset(entry), nr_pages); } /** @@ -1598,28 +1799,6 @@ void folio_put_swap(struct folio *folio, struct page *subpage) swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset) -{ - unsigned char count; - - count = si->swap_map[offset]; - if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { - if (count == COUNT_CONTINUED) { - if (swap_count_continued(si, offset, count)) - count = SWAP_MAP_MAX | COUNT_CONTINUED; - else - count = SWAP_MAP_MAX; - } else - count--; - } - - WRITE_ONCE(si->swap_map[offset], count); - if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) - swap_entries_free(si, ci, offset, 1); -} - /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU @@ -1686,31 +1865,30 @@ put_out: } /* - * Drop the last ref of swap entries, caller have to ensure all entries - * belong to the same cgroup and cluster. + * Free a set of swap slots after their swap count dropped to zero, or will be + * zero after putting the last ref (saves one __swap_cluster_put_entry call). 
*/ -void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages) +void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_start, unsigned int nr_pages) { - swp_entry_t entry = swp_entry(si->type, offset); - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned long old_tb; + unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; + unsigned long offset = cluster_offset(si, ci) + ci_start; - /* It should never free entries across different clusters */ - VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); - VM_BUG_ON(cluster_is_empty(ci)); - VM_BUG_ON(ci->count < nr_pages); + VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { - VM_WARN_ON(*map > 1); - *map = 0; - } while (++map < map_end); + old_tb = __swap_table_get(ci, ci_off); + /* Release the last ref, or after swap cache is dropped */ + VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); + __swap_table_set(ci, ci_off, null_to_swp_tb()); + } while (++ci_off < ci_end); - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); swap_range_free(si, offset, nr_pages); - swap_cluster_assert_table_empty(ci, offset, nr_pages); + swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) free_cluster(si, ci); @@ -1720,10 +1898,10 @@ void swap_entries_free(struct swap_info_struct *si, int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry); - return si->swap_map[offset]; + return swp_tb_get_count(__swap_table_get(ci, ci_off)); } /** @@ -1735,103 +1913,79 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t 
offset = swp_offset(entry); struct swap_cluster_info *ci; - int count; + unsigned long swp_tb; ci = swap_cluster_lock(si, offset); - count = si->swap_map[offset]; + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); swap_cluster_unlock(ci); - return count && count != SWAP_MAP_BAD; + return swp_tb_get_count(swp_tb) > 0; } /* * How many references to @entry are currently swapped out? - * This considers COUNT_CONTINUED so it returns exact answer. + * This returns exact answer. */ int swp_swapcount(swp_entry_t entry) { - int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; - struct page *page; - pgoff_t offset; - unsigned char *map; + unsigned long swp_tb; + int count; si = get_swap_device(entry); if (!si) return 0; - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - if (!(count & COUNT_CONTINUED)) - goto out; - - count &= ~COUNT_CONTINUED; - n = SWAP_MAP_MAX + 1; - - page = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - VM_BUG_ON(page_private(page) != SWP_CONTINUED); - - do { - page = list_next_entry(page, lru); - map = kmap_local_page(page); - tmp_count = map[offset]; - kunmap_local(map); - - count += (tmp_count & ~COUNT_CONTINUED) * n; - n *= (SWAP_CONT_MAX + 1); - } while (tmp_count & COUNT_CONTINUED); -out: + ci = swap_cluster_lock(si, swp_offset(entry)); + swp_tb = __swap_table_get(ci, swp_cluster_offset(entry)); + count = swp_tb_get_count(swp_tb); + if (count == SWP_TB_COUNT_MAX) + count = ci->extend_table[swp_cluster_offset(entry)]; swap_cluster_unlock(ci); put_swap_device(si); - return count; -} -static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, - swp_entry_t entry, int order) -{ - struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; - unsigned int nr_pages = 1 << order; - unsigned long roffset = swp_offset(entry); - unsigned long offset = round_down(roffset, nr_pages); - int i; - bool ret = false; - - ci = 
swap_cluster_lock(si, offset); - if (nr_pages == 1) { - if (map[roffset]) - ret = true; - goto unlock_out; - } - for (i = 0; i < nr_pages; i++) { - if (map[offset + i]) { - ret = true; - break; - } - } -unlock_out: - swap_cluster_unlock(ci); - return ret; + return count < 0 ? 0 : count; } -static bool folio_swapped(struct folio *folio) +/* + * folio_maybe_swapped - Test if a folio covers any swap slot with count > 0. + * + * Check if a folio is swapped. Holding the folio lock ensures the folio won't + * go from not-swapped to swapped because the initial swap count increment can + * only be done by folio_dup_swap, which also locks the folio. But a concurrent + * decrease of swap count is possible through swap_put_entries_direct, so this + * may return a false positive. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + */ +static bool folio_maybe_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; - struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned int ci_off, ci_end; + bool ret = false; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); - si = __swap_entry_to_info(entry); - if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return swap_entry_swapped(si, entry); + ci = __swap_entry_to_cluster(entry); + ci_off = swp_cluster_offset(entry); + ci_end = ci_off + folio_nr_pages(folio); + /* + * Extra locking not needed, folio lock ensures its swap entries + * won't be released, the backing data won't be gone either. 
+ */ + rcu_read_lock(); + do { + if (__swp_tb_get_count(__swap_table_get(ci, ci_off))) { + ret = true; + break; + } + } while (++ci_off < ci_end); + rcu_read_unlock(); - return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); + return ret; } static bool folio_swapcache_freeable(struct folio *folio) @@ -1877,7 +2031,7 @@ bool folio_free_swap(struct folio *folio) { if (!folio_swapcache_freeable(folio)) return false; - if (folio_swapped(folio)) + if (folio_maybe_swapped(folio)) return false; swap_cache_del_folio(folio); @@ -1926,8 +2080,9 @@ out: /* Allocate a slot for hibernation */ swp_entry_t swap_alloc_hibernation_slot(int type) { - struct swap_info_struct *si = swap_type_to_info(type); - unsigned long offset; + struct swap_info_struct *pcp_si, *si = swap_type_to_info(type); + unsigned long pcp_offset, offset = SWAP_ENTRY_INVALID; + struct swap_cluster_info *ci; swp_entry_t entry = {0}; if (!si) @@ -1937,11 +2092,21 @@ swp_entry_t swap_alloc_hibernation_slot(int type) if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { /* - * Grab the local lock to be compliant - * with swap table allocation. + * Try the local cluster first if it matches the device. If + * not, try grab a new cluster and override local cluster. 
*/ local_lock(&percpu_swap_cluster.lock); - offset = cluster_alloc_swap_entry(si, NULL); + pcp_si = this_cpu_read(percpu_swap_cluster.si[0]); + pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]); + if (pcp_si == si && pcp_offset) { + ci = swap_cluster_lock(si, pcp_offset); + if (cluster_is_usable(ci, 0)) + offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); + else + swap_cluster_unlock(ci); + } + if (!offset) + offset = cluster_alloc_swap_entry(si, NULL); local_unlock(&percpu_swap_cluster.lock); if (offset) entry = swp_entry(si->type, offset); @@ -1964,7 +2129,8 @@ void swap_free_hibernation_slot(swp_entry_t entry) return; ci = swap_cluster_lock(si, offset); - swap_put_entry_locked(si, ci, offset); + __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); + __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); swap_cluster_unlock(ci); /* In theory readahead might add it to the swap cache by accident */ @@ -2190,13 +2356,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned int type) { pte_t *pte = NULL; - struct swap_info_struct *si; - si = swap_info[type]; do { struct folio *folio; - unsigned long offset; - unsigned char swp_count; + unsigned long swp_tb; softleaf_t entry; int ret; pte_t ptent; @@ -2215,7 +2378,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (swp_type(entry) != type) continue; - offset = swp_offset(entry); pte_unmap(pte); pte = NULL; @@ -2232,8 +2394,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, &vmf); } if (!folio) { - swp_count = READ_ONCE(si->swap_map[offset]); - if (swp_count == 0 || swp_count == SWAP_MAP_BAD) + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_get_count(swp_tb) <= 0) continue; return -ENOMEM; } @@ -2361,7 +2524,7 @@ unlock: } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap table from current position to next entry still in use. 
* Return 0 if there are no inuse entries after prev till end of * the map. */ @@ -2370,7 +2533,6 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int i; unsigned long swp_tb; - unsigned char count; /* * No need for swap_lock here: we're just looking @@ -2379,12 +2541,9 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { - count = READ_ONCE(si->swap_map[i]); swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), i % SWAPFILE_CLUSTER); - if (count == SWAP_MAP_BAD) - continue; - if (count || swp_tb_is_folio(swp_tb)) + if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); @@ -2521,7 +2680,8 @@ static void drain_mmlist(void) /* * Free all of a swapdev's extent information */ -static void destroy_swap_extents(struct swap_info_struct *sis) +static void destroy_swap_extents(struct swap_info_struct *sis, + struct file *swap_file) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; @@ -2532,7 +2692,6 @@ static void destroy_swap_extents(struct swap_info_struct *sis) } if (sis->flags & SWP_ACTIVATED) { - struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; @@ -2615,9 +2774,9 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. 
*/ -static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) +static int setup_swap_extents(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) { - struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; @@ -2635,7 +2794,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { - destroy_swap_extents(sis); + destroy_swap_extents(sis, swap_file); return -ENOMEM; } return ret; @@ -2644,23 +2803,6 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static void setup_swap_info(struct swap_info_struct *si, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info, - unsigned long *zeromap) -{ - si->prio = prio; - /* - * the plist prio is negated because plist ordering is - * low-to-high, while swap ordering is high-to-low - */ - si->list.prio = -si->prio; - si->avail_list.prio = -si->prio; - si->swap_map = swap_map; - si->cluster_info = cluster_info; - si->zeromap = zeromap; -} - static void _enable_swap_info(struct swap_info_struct *si) { atomic_long_add(si->pages, &nr_swap_pages); @@ -2674,19 +2816,12 @@ static void _enable_swap_info(struct swap_info_struct *si) add_to_avail_list(si, true); } -static void enable_swap_info(struct swap_info_struct *si, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info, - unsigned long *zeromap) +/* + * Called after the swap device is ready, resurrect its percpu ref, it's now + * safe to reference it. Add it to the list to expose it to the allocator. 
+ */ +static void enable_swap_info(struct swap_info_struct *si) { - spin_lock(&swap_lock); - spin_lock(&si->lock); - setup_swap_info(si, prio, swap_map, cluster_info, zeromap); - spin_unlock(&si->lock); - spin_unlock(&swap_lock); - /* - * Finished initializing swap device, now it's safe to reference it. - */ percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); spin_lock(&si->lock); @@ -2699,7 +2834,6 @@ static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); - setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); @@ -2723,8 +2857,8 @@ static void wait_for_allocation(struct swap_info_struct *si) } } -static void free_cluster_info(struct swap_cluster_info *cluster_info, - unsigned long maxpages) +static void free_swap_cluster_info(struct swap_cluster_info *cluster_info, + unsigned long maxpages) { struct swap_cluster_info *ci; int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); @@ -2736,7 +2870,7 @@ static void free_cluster_info(struct swap_cluster_info *cluster_info, /* Cluster with bad marks count will have a remaining table */ spin_lock(&ci->lock); if (rcu_dereference_protected(ci->table, true)) { - ci->count = 0; + swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, true); swap_cluster_free_table(ci); } spin_unlock(&ci->lock); @@ -2769,7 +2903,6 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si) SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; - unsigned char *swap_map; unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; @@ -2846,9 +2979,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) flush_work(&p->reclaim_work); flush_percpu_swap_cluster(p); - destroy_swap_extents(p); - if (p->flags & SWP_CONTINUED) - free_swap_count_continuations(p); + destroy_swap_extents(p, p->swap_file); if 
(!(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); @@ -2860,8 +2991,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - swap_map = p->swap_map; - p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; maxpages = p->max; @@ -2875,9 +3004,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster = NULL; - vfree(swap_map); kvfree(zeromap); - free_cluster_info(cluster_info, maxpages); + free_swap_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); @@ -2934,7 +3062,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) return SEQ_START_TOKEN; for (type = 0; (si = swap_type_to_info(type)); type++) { - if (!(si->flags & SWP_USED) || !si->swap_map) + if (!(si->swap_file)) continue; if (!--l) return si; @@ -2955,7 +3083,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) ++(*pos); for (; (si = swap_type_to_info(type)); type++) { - if (!(si->flags & SWP_USED) || !si->swap_map) + if (!(si->swap_file)) continue; return si; } @@ -3095,7 +3223,6 @@ static struct swap_info_struct *alloc_swap_info(void) kvfree(defer); } spin_lock_init(&p->lock); - spin_lock_init(&p->cont_lock); atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); @@ -3222,35 +3349,9 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -static int setup_swap_map(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned char *swap_map, - unsigned long maxpages) -{ - unsigned long i; - - swap_map[0] = SWAP_MAP_BAD; /* omit header page */ - for (i = 0; i < swap_header->info.nr_badpages; i++) { - unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr == 0 || page_nr > swap_header->info.last_page) - return -EINVAL; - if (page_nr < maxpages) { - swap_map[page_nr] = SWAP_MAP_BAD; - 
si->pages--; - } - } - - if (!si->pages) { - pr_warn("Empty swap-file\n"); - return -EINVAL; - } - - return 0; -} - -static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned long maxpages) +static int setup_swap_clusters_info(struct swap_info_struct *si, + union swap_header *swap_header, + unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; @@ -3274,26 +3375,28 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } /* - * Mark unusable pages as unavailable. The clusters aren't - * marked free yet, so no list operations are involved yet. - * - * See setup_swap_map(): header page, bad pages, - * and the EOF part of the last cluster. + * Mark unusable pages (header page, bad pages, and the EOF part of + * the last cluster) as unavailable. The clusters aren't marked free + * yet, so no list operations are involved yet. 
*/ - err = swap_cluster_setup_bad_slot(cluster_info, 0); + err = swap_cluster_setup_bad_slot(si, cluster_info, 0, false); if (err) goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr >= maxpages) - continue; - err = swap_cluster_setup_bad_slot(cluster_info, page_nr); + if (!page_nr || page_nr > swap_header->info.last_page) { + pr_warn("Bad slot offset is out of border: %d (last_page: %d)\n", + page_nr, swap_header->info.last_page); + err = -EINVAL; + goto err; + } + err = swap_cluster_setup_bad_slot(si, cluster_info, page_nr, false); if (err) goto err; } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { - err = swap_cluster_setup_bad_slot(cluster_info, i); + err = swap_cluster_setup_bad_slot(si, cluster_info, i, true); if (err) goto err; } @@ -3319,10 +3422,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } } - return cluster_info; + si->cluster_info = cluster_info; + return 0; err: - free_cluster_info(cluster_info, maxpages); - return ERR_PTR(err); + free_swap_cluster_info(cluster_info, maxpages); + return err; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) @@ -3337,9 +3441,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) int nr_extents; sector_t span; unsigned long maxpages; - unsigned char *swap_map = NULL; - unsigned long *zeromap = NULL; - struct swap_cluster_info *cluster_info = NULL; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3350,6 +3451,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * Allocate or reuse existing !SWP_USED swap_info. The returned + * si will stay in a dying status, so nothing will access its content + * until enable_swap_info resurrects its percpu ref and expose it. 
+ */ si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); @@ -3365,7 +3471,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - si->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; @@ -3415,7 +3520,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) si->max = maxpages; si->pages = maxpages - 1; - nr_extents = setup_swap_extents(si, &span); + nr_extents = setup_swap_extents(si, swap_file, &span); if (nr_extents < 0) { error = nr_extents; goto bad_swap_unlock_inode; @@ -3428,18 +3533,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) maxpages = si->max; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vzalloc(maxpages); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap_unlock_inode; - } - - error = swap_cgroup_swapon(si->type, maxpages); + /* Set up the swap cluster info */ + error = setup_swap_clusters_info(si, swap_header, maxpages); if (error) goto bad_swap_unlock_inode; - error = setup_swap_map(si, swap_header, swap_map, maxpages); + error = swap_cgroup_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; @@ -3447,9 +3546,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. 
*/ - zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), - GFP_KERNEL | __GFP_ZERO); - if (!zeromap) { + si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), + GFP_KERNEL | __GFP_ZERO); + if (!si->zeromap) { error = -ENOMEM; goto bad_swap_unlock_inode; } @@ -3460,20 +3559,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; - if (si->bdev && bdev_nonrot(si->bdev)) { + if (si->bdev && !bdev_rot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } - cluster_info = setup_clusters(si, swap_header, maxpages); - if (IS_ERR(cluster_info)) { - error = PTR_ERR(cluster_info); - cluster_info = NULL; - goto bad_swap_unlock_inode; - } - if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* @@ -3524,7 +3616,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) prio = DEF_SWAP_PRIO; if (swap_flags & SWAP_FLAG_PREFER) prio = swap_flags & SWAP_FLAG_PRIO_MASK; - enable_swap_info(si, prio, swap_map, cluster_info, zeromap); + + /* + * The plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low + */ + si->prio = prio; + si->list.prio = -si->prio; + si->avail_list.prio = -si->prio; + si->swap_file = swap_file; + + /* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */ + enable_swap_info(si); pr_info("Adding %uk swap on %s. 
Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, @@ -3548,16 +3651,19 @@ bad_swap: kfree(si->global_cluster); si->global_cluster = NULL; inode = NULL; - destroy_swap_extents(si); + destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); + free_swap_cluster_info(si->cluster_info, si->max); + si->cluster_info = NULL; + kvfree(si->zeromap); + si->zeromap = NULL; + /* + * Clear the SWP_USED flag after all resources are freed so + * alloc_swap_info can reuse this si safely. + */ spin_lock(&swap_lock); - si->swap_file = NULL; si->flags = 0; spin_unlock(&swap_lock); - vfree(swap_map); - kvfree(zeromap); - if (cluster_info) - free_cluster_info(cluster_info, maxpages); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) @@ -3588,321 +3694,37 @@ void si_swapinfo(struct sysinfo *val) } /* - * Verify that nr swap entries are valid and increment their swap map counts. - * - * Returns error code in following case. - * - success -> 0 - * - swp_entry is invalid -> EINVAL - * - swap-mapped reference is requested but the entry is not used. -> ENOENT - * - swap-mapped reference requested but needs continued swap count. -> ENOMEM - */ -static int swap_dup_entries(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, - unsigned char usage, int nr) -{ - int i; - unsigned char count; - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - /* - * For swapin out, allocator never allocates bad slots. for - * swapin, readahead is guarded by swap_entry_swapped. - */ - if (WARN_ON(count == SWAP_MAP_BAD)) - return -ENOENT; - /* - * Swap count duplication must be guarded by either swap cache folio (from - * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct). 
- */ - if (WARN_ON(!count && - !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) - return -ENOENT; - if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) - return -EINVAL; - } - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) - count += usage; - else if (swap_count_continued(si, offset + i, count)) - count = COUNT_CONTINUED; - else { - /* - * Don't need to rollback changes, because if - * usage == 1, there must be nr == 1. - */ - return -ENOMEM; - } - - WRITE_ONCE(si->swap_map[offset + i], count); - } - - return 0; -} - -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) -{ - int err; - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - - si = swap_entry_to_info(entry); - if (WARN_ON_ONCE(!si)) { - pr_err("%s%08lx\n", Bad_file, entry.val); - return -EINVAL; - } - - VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - ci = swap_cluster_lock(si, offset); - err = swap_dup_entries(si, ci, offset, usage, nr); - swap_cluster_unlock(ci); - return err; -} - -/* * swap_dup_entry_direct() - Increase reference count of a swap entry by one. * @entry: first swap entry from which we want to increase the refcount. * - * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required - * but could not be atomically allocated. Returns 0, just as if it succeeded, - * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which - * might occur if a page table entry has got corrupted. + * Returns 0 for success, or -ENOMEM if the extend table is required + * but could not be atomically allocated. Returns -EINVAL if the swap + * entry is invalid, which might occur if a page table entry has got + * corrupted. * * Context: Caller must ensure there is no race condition on the reference * owner. e.g., locking the PTL of a PTE containing the entry being increased. 
+ * Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should + * be used. */ int swap_dup_entry_direct(swp_entry_t entry) { - int err = 0; - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - return err; -} - -/* - * add_swap_count_continuation - called when a swap count is duplicated - * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's - * page of the original vmalloc'ed swap_map, to hold the continuation count - * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called - * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. - * - * These continuation pages are seldom referenced: the common paths all work - * on the original swap_map, only referring to a continuation page when the - * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. - * - * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding - * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) - * can be called after dropping locks. - */ -int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) -{ struct swap_info_struct *si; - struct swap_cluster_info *ci; - struct page *head; - struct page *page; - struct page *list_page; - pgoff_t offset; - unsigned char count; - int ret = 0; - - /* - * When debugging, it's easier to use __GFP_ZERO here; but it's better - * for latency not to zero a page while GFP_ATOMIC and holding locks. 
- */ - page = alloc_page(gfp_mask | __GFP_HIGHMEM); - - si = get_swap_device(entry); - if (!si) { - /* - * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap device may be swapoff - */ - goto outer; - } - - offset = swp_offset(entry); - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - - if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { - /* - * The higher the swap count, the more likely it is that tasks - * will race to add swap count continuation: we need to avoid - * over-provisioning. - */ - goto out; - } - - if (!page) { - ret = -ENOMEM; - goto out; + si = swap_entry_to_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; } - head = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - - spin_lock(&si->cont_lock); /* - * Page allocation does not initialize the page's lru field, - * but it does always reset its private field. + * The caller must be increasing the swap count from a direct + * reference of the swap slot (e.g. a swap entry in page table). + * So the swap count must be >= 1. */ - if (!page_private(head)) { - BUG_ON(count & COUNT_CONTINUED); - INIT_LIST_HEAD(&head->lru); - set_page_private(head, SWP_CONTINUED); - si->flags |= SWP_CONTINUED; - } - - list_for_each_entry(list_page, &head->lru, lru) { - unsigned char *map; - - /* - * If the previous map said no continuation, but we've found - * a continuation page, free our allocation and use this one. - */ - if (!(count & COUNT_CONTINUED)) - goto out_unlock_cont; - - map = kmap_local_page(list_page) + offset; - count = *map; - kunmap_local(map); - - /* - * If this continuation count now has some space in it, - * free our allocation and use this one. 
- */ - if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) - goto out_unlock_cont; - } - - list_add_tail(&page->lru, &head->lru); - page = NULL; /* now it's attached, don't free it */ -out_unlock_cont: - spin_unlock(&si->cont_lock); -out: - swap_cluster_unlock(ci); - put_swap_device(si); -outer: - if (page) - __free_page(page); - return ret; -} - -/* - * swap_count_continued - when the original swap_map count is incremented - * from SWAP_MAP_MAX, check if there is already a continuation page to carry - * into, carry if so, or else fail until a new continuation page is allocated; - * when the original swap_map count is decremented from 0 with continuation, - * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of swap_put_entry_locked() - * holds cluster lock. - */ -static bool swap_count_continued(struct swap_info_struct *si, - pgoff_t offset, unsigned char count) -{ - struct page *head; - struct page *page; - unsigned char *map; - bool ret; - - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head) != SWP_CONTINUED) { - BUG_ON(count & COUNT_CONTINUED); - return false; /* need to add count continuation */ - } - - spin_lock(&si->cont_lock); - offset &= ~PAGE_MASK; - page = list_next_entry(head, lru); - map = kmap_local_page(page) + offset; - - if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ - goto init_map; /* jump over SWAP_CONT_MAX checks */ + VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry)); - if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ - /* - * Think of how you add 1 to 999 - */ - while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - if (*map == SWAP_CONT_MAX) { - kunmap_local(map); - page = list_next_entry(page, lru); - if (page == head) { - ret = false; /* add count continuation */ - goto out; - } - map = 
kmap_local_page(page) + offset; -init_map: *map = 0; /* we didn't zero the page */ - } - *map += 1; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = COUNT_CONTINUED; - kunmap_local(map); - } - ret = true; /* incremented */ - - } else { /* decrementing */ - /* - * Think of how you subtract 1 from 1000 - */ - BUG_ON(count != COUNT_CONTINUED); - while (*map == COUNT_CONTINUED) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - BUG_ON(*map == 0); - *map -= 1; - if (*map == 0) - count = 0; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = SWAP_CONT_MAX | count; - count = COUNT_CONTINUED; - kunmap_local(map); - } - ret = count == COUNT_CONTINUED; - } -out: - spin_unlock(&si->cont_lock); - return ret; -} - -/* - * free_swap_count_continuations - swapoff free all the continuation pages - * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 
- */ -static void free_swap_count_continuations(struct swap_info_struct *si) -{ - pgoff_t offset; - - for (offset = 0; offset < si->max; offset += PAGE_SIZE) { - struct page *head; - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head)) { - struct page *page, *next; - - list_for_each_entry_safe(page, next, &head->lru, lru) { - list_del(&page->lru); - __free_page(page); - } - } - } + return swap_dup_entries_cluster(si, swp_offset(entry), 1); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) diff --git a/mm/truncate.c b/mm/truncate.c index 12467c1bd711..12cc89f89afc 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -17,7 +17,7 @@ #include <linux/export.h> #include <linux/pagemap.h> #include <linux/highmem.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/task_io_accounting_ops.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> @@ -369,7 +369,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; pgoff_t index; int i; struct folio *folio; @@ -534,7 +534,7 @@ EXPORT_SYMBOL(truncate_inode_pages_final); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed) { - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; @@ -622,6 +622,7 @@ static int folio_launder(struct address_space *mapping, struct folio *folio) int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp) { + void (*free_folio)(struct folio *); int ret; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -648,9 +649,12 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); + 
free_folio = mapping->a_ops->free_folio; spin_unlock(&mapping->host->i_lock); - filemap_free_folio(mapping, folio); + if (free_folio) + free_folio(folio); + folio_put_refs(folio, folio_nr_pages(folio)); return 1; failed: xa_unlock_irq(&mapping->i_pages); @@ -672,7 +676,7 @@ failed: int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio_batch fbatch; pgoff_t index; int i; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 927086bb4a3c..885da1e56466 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -14,12 +14,61 @@ #include <linux/userfaultfd_k.h> #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> -#include <linux/shmem_fs.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" +struct mfill_state { + struct userfaultfd_ctx *ctx; + unsigned long src_start; + unsigned long dst_start; + unsigned long len; + uffd_flags_t flags; + + struct vm_area_struct *vma; + unsigned long src_addr; + unsigned long dst_addr; + pmd_t *pmd; +}; + +static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + /* anonymous memory does not support MINOR mode */ + if (vm_flags & VM_UFFD_MINOR) + return false; + return true; +} + +static struct folio *anon_alloc_folio(struct vm_area_struct *vma, + unsigned long addr) +{ + struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + addr); + + if (!folio) + return NULL; + + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; + } + + return folio; +} + +static const struct vm_uffd_ops anon_uffd_ops = { + .can_userfault = anon_can_userfault, + .alloc_folio = anon_alloc_folio, +}; + +static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return &anon_uffd_ops; + return vma->vm_ops ? 
vma->vm_ops->uffd_ops : NULL; +} + static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { @@ -143,6 +192,128 @@ static void uffd_mfill_unlock(struct vm_area_struct *vma) } #endif +static void mfill_put_vma(struct mfill_state *state) +{ + if (!state->vma) + return; + + up_read(&state->ctx->map_changing_lock); + uffd_mfill_unlock(state->vma); + state->vma = NULL; +} + +static int mfill_get_vma(struct mfill_state *state) +{ + struct userfaultfd_ctx *ctx = state->ctx; + uffd_flags_t flags = state->flags; + struct vm_area_struct *dst_vma; + const struct vm_uffd_ops *ops; + int err; + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len); + if (IS_ERR(dst_vma)) + return PTR_ERR(dst_vma); + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + state->vma = dst_vma; + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out_unlock; + + err = -EINVAL; + + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + + /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. 
+ */ + if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + + if (is_vm_hugetlb_page(dst_vma)) + return 0; + + ops = vma_uffd_ops(dst_vma); + if (!ops) + goto out_unlock; + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && + !ops->get_folio_noalloc) + goto out_unlock; + + return 0; + +out_unlock: + mfill_put_vma(state); + return err; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset(mm, address); + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, address); + if (!pud) + return NULL; + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + return pmd_alloc(mm, pud, address); +} + +static int mfill_establish_pmd(struct mfill_state *state) +{ + struct mm_struct *dst_mm = state->ctx->mm; + pmd_t *dst_pmd, dst_pmdval; + + dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr); + if (unlikely(!dst_pmd)) + return -ENOMEM; + + dst_pmdval = pmdp_get_lockless(dst_pmd); + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_pmd))) + return -ENOMEM; + + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is THP don't override it and just be strict. + * (This includes the case where the PMD used to be THP and + * changed back to none after __pte_alloc().) + */ + if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval))) + return -EEXIST; + if (unlikely(pmd_bad(dst_pmdval))) + return -EFAULT; + + state->pmd = dst_pmd; + return 0; +} + /* Check if dst_addr is outside of file's size. Must be called with ptl held. 
*/ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -165,10 +336,10 @@ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem * and anon, and for both shared and private VMAs. */ -int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags) +static int mfill_atomic_install_pte(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, struct page *page, + uffd_flags_t flags) { int ret; struct mm_struct *dst_mm = dst_vma->vm_mm; @@ -212,9 +383,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, goto out_unlock; if (page_in_cache) { - /* Usually, cache pages are already added to LRU */ - if (newly_allocated) - folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); @@ -229,6 +397,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + if (page_in_cache) + folio_unlock(folio); + /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; @@ -238,58 +409,100 @@ out: return ret; } -static int mfill_atomic_pte_copy(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr) { void *kaddr; int ret; + + kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. 
For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); + ret = copy_from_user(kaddr, (const void __user *) src_addr, + PAGE_SIZE); + pagefault_enable(); + kunmap_local(kaddr); + + if (ret) + return -EFAULT; + + flush_dcache_folio(folio); + return ret; +} + +static int mfill_copy_folio_retry(struct mfill_state *state, struct folio *folio) +{ + unsigned long src_addr = state->src_addr; + void *kaddr; + int err; + + /* retry copying with mm_lock dropped */ + mfill_put_vma(state); + + kaddr = kmap_local_folio(folio, 0); + err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); + kunmap_local(kaddr); + if (unlikely(err)) + return -EFAULT; + + flush_dcache_folio(folio); + + /* reget VMA and PMD, they could change underneath us */ + err = mfill_get_vma(state); + if (err) + return err; + + err = mfill_establish_pmd(state); + if (err) + return err; + + return 0; +} + +static int __mfill_atomic_pte(struct mfill_state *state, + const struct vm_uffd_ops *ops) +{ + unsigned long dst_addr = state->dst_addr; + unsigned long src_addr = state->src_addr; + uffd_flags_t flags = state->flags; struct folio *folio; + int ret; - if (!*foliop) { - ret = -ENOMEM; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, - dst_addr); - if (!folio) - goto out; + folio = ops->alloc_folio(state->vma, state->dst_addr); + if (!folio) + return -ENOMEM; - kaddr = kmap_local_folio(folio, 0); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { + ret = mfill_copy_folio_locked(folio, src_addr); /* - * The read mmap_lock is held 
here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. + * Fallback to copy_from_user outside mmap_lock. + * If retry is successful, mfill_copy_folio_locked() returns + * with locks retaken by mfill_get_vma(). + * If there was an error, we must mfill_put_vma() anyway and it + * will take care of unlocking if needed. */ - pagefault_disable(); - ret = copy_from_user(kaddr, (const void __user *) src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(kaddr); - - /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { - ret = -ENOENT; - *foliop = folio; - /* don't free the page */ - goto out; + ret = mfill_copy_folio_retry(state, folio); + if (ret) + goto err_folio_put; } - - flush_dcache_folio(folio); + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + clear_user_highpage(&folio->page, state->dst_addr); } else { - folio = *foliop; - *foliop = NULL; + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); } /* @@ -299,65 +512,67 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, */ __folio_mark_uptodate(folio); - ret = -ENOMEM; - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_release; + if (ops->filemap_add) { + ret = ops->filemap_add(folio, state->vma, state->dst_addr); + if (ret) + goto err_folio_put; + } - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, flags); + ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, + 
&folio->page, flags); if (ret) - goto out_release; -out: - return ret; -out_release: + goto err_filemap_remove; + + return 0; + +err_filemap_remove: + if (ops->filemap_remove) + ops->filemap_remove(folio, state->vma); +err_folio_put: folio_put(folio); - goto out; + return ret; } -static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_copy(struct mfill_state *state) { - struct folio *folio; - int ret = -ENOMEM; - - folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); - if (!folio) - return ret; - - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_put; + const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); /* - * The memory barrier inside __folio_mark_uptodate makes sure that - * zeroing out the folio become visible before mapping the page - * using set_pte_at(). See do_anonymous_page(). + * The normal page fault path for a MAP_PRIVATE mapping in a + * file-backed VMA will invoke the fault, fill the hole in the file and + * COW it right away. The result generates plain anonymous memory. + * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll + * generate anonymous memory directly without actually filling the + * hole. For the MAP_PRIVATE case the robustness check only happens in + * the pagetable (to verify it's still none) and not in the page cache. 
*/ - __folio_mark_uptodate(folio); + if (!(state->vma->vm_flags & VM_SHARED)) + ops = &anon_uffd_ops; - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, 0); - if (ret) - goto out_put; + return __mfill_atomic_pte(state, ops); +} - return 0; -out_put: - folio_put(folio); - return ret; +static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state) +{ + const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); + + return __mfill_atomic_pte(state, ops); } -static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_zeropage(struct mfill_state *state) { + struct vm_area_struct *dst_vma = state->vma; + unsigned long dst_addr = state->dst_addr; + pmd_t *dst_pmd = state->pmd; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; - if (mm_forbids_zeropage(dst_vma->vm_mm)) - return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + if (mm_forbids_zeropage(dst_vma->vm_mm) || + (dst_vma->vm_flags & VM_SHARED)) + return mfill_atomic_pte_zeroed_folio(state); - _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); @@ -381,28 +596,29 @@ out: } /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). 
*/ -static int mfill_atomic_pte_continue(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - uffd_flags_t flags) +static int mfill_atomic_pte_continue(struct mfill_state *state) { - struct inode *inode = file_inode(dst_vma->vm_file); + struct vm_area_struct *dst_vma = state->vma; + const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma); + unsigned long dst_addr = state->dst_addr; pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct inode *inode = file_inode(dst_vma->vm_file); + uffd_flags_t flags = state->flags; + pmd_t *dst_pmd = state->pmd; struct folio *folio; struct page *page; int ret; - ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); - /* Our caller expects us to return -EFAULT if we failed to find folio */ - if (ret == -ENOENT) - ret = -EFAULT; - if (ret) - goto out; - if (!folio) { - ret = -EFAULT; - goto out; + if (!ops) { + VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA"); + return -EOPNOTSUPP; } + folio = ops->get_folio_noalloc(inode, pgoff); + /* Our caller expects us to return -EFAULT if we failed to find folio */ + if (IS_ERR_OR_NULL(folio)) + return -EFAULT; + page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; @@ -410,30 +626,28 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, } ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - page, false, flags); + page, flags); if (ret) goto out_release; - folio_unlock(folio); - ret = 0; -out: - return ret; + return 0; + out_release: folio_unlock(folio); folio_put(folio); - goto out; + return ret; } /* Handles UFFDIO_POISON for all non-hugetlb VMAs. 
*/ -static int mfill_atomic_pte_poison(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - uffd_flags_t flags) +static int mfill_atomic_pte_poison(struct mfill_state *state) { - int ret; + struct vm_area_struct *dst_vma = state->vma; struct mm_struct *dst_mm = dst_vma->vm_mm; + unsigned long dst_addr = state->dst_addr; + pmd_t *dst_pmd = state->pmd; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; + int ret; _dst_pte = make_pte_marker(PTE_MARKER_POISONED); ret = -EAGAIN; @@ -462,27 +676,6 @@ out: return ret; } -static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - - pgd = pgd_offset(mm, address); - p4d = p4d_alloc(mm, pgd, address); - if (!p4d) - return NULL; - pud = pud_alloc(mm, p4d, address); - if (!pud) - return NULL; - /* - * Note that we didn't run this because the pmd was - * missing, the *pmd may be already established and in - * turn it may also be a trans_huge_pmd. - */ - return pmd_alloc(mm, pud, address); -} - #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is @@ -573,7 +766,7 @@ retry: * in the case of shared pmds. fault mutex prevents * races with other faulting threads. 
*/ - idx = linear_page_index(dst_vma, dst_addr); + idx = hugetlb_linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -657,48 +850,21 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ -static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) { - ssize_t err; - - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { - return mfill_atomic_pte_continue(dst_pmd, dst_vma, - dst_addr, flags); - } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { - return mfill_atomic_pte_poison(dst_pmd, dst_vma, - dst_addr, flags); - } - - /* - * The normal page fault path for a shmem will invoke the - * fault, fill the hole in the file and COW it right away. The - * result generates plain anonymous memory. So when we are - * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll - * generate anonymous memory directly without actually filling - * the hole. For the MAP_PRIVATE case the robustness check - * only happens in the pagetable (to verify it's still none) - * and not in the radix tree. 
- */ - if (!(dst_vma->vm_flags & VM_SHARED)) { - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) - err = mfill_atomic_pte_copy(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); - else - err = mfill_atomic_pte_zeropage(dst_pmd, - dst_vma, dst_addr); - } else { - err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); - } - - return err; + uffd_flags_t flags = state->flags; + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + return mfill_atomic_pte_continue(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) + return mfill_atomic_pte_poison(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) + return mfill_atomic_pte_copy(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) + return mfill_atomic_pte_zeropage(state); + + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); + return -EOPNOTSUPP; } static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, @@ -707,13 +873,17 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, unsigned long len, uffd_flags_t flags) { - struct mm_struct *dst_mm = ctx->mm; - struct vm_area_struct *dst_vma; + struct mfill_state state = (struct mfill_state){ + .ctx = ctx, + .dst_start = dst_start, + .src_start = src_start, + .flags = flags, + .len = len, + .src_addr = src_start, + .dst_addr = dst_start, + }; + long copied = 0; ssize_t err; - pmd_t *dst_pmd; - unsigned long src_addr, dst_addr; - long copied; - struct folio *folio; /* * Sanitize the command parameters: @@ -725,125 +895,35 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, VM_WARN_ON_ONCE(src_start + len <= src_start); VM_WARN_ON_ONCE(dst_start + len <= dst_start); - src_addr = src_start; - dst_addr = dst_start; - copied = 0; - folio = NULL; -retry: - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. 
- */ - dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); - if (IS_ERR(dst_vma)) { - err = PTR_ERR(dst_vma); + err = mfill_get_vma(&state); + if (err) goto out; - } - - /* - * If memory mappings are changing because of non-cooperative - * operation (e.g. mremap) running in parallel, bail out and - * request the user to retry later - */ - down_read(&ctx->map_changing_lock); - err = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) - goto out_unlock; - - err = -EINVAL; - /* - * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but - * it will overwrite vm_ops, so vma_is_anonymous must return false. - */ - if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && - dst_vma->vm_flags & VM_SHARED)) - goto out_unlock; - - /* - * validate 'mode' now that we know the dst_vma: don't allow - * a wrprotect copy if the userfaultfd didn't register as WP. - */ - if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) - goto out_unlock; /* * If this is a HUGETLB vma, pass off to appropriate routine */ - if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + if (is_vm_hugetlb_page(state.vma)) + return mfill_atomic_hugetlb(ctx, state.vma, dst_start, src_start, len, flags); - if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) - goto out_unlock; - if (!vma_is_shmem(dst_vma) && - uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) - goto out_unlock; - - while (src_addr < src_start + len) { - pmd_t dst_pmdval; - - VM_WARN_ON_ONCE(dst_addr >= dst_start + len); + while (state.src_addr < src_start + len) { + VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len); - dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); - if (unlikely(!dst_pmd)) { - err = -ENOMEM; + err = mfill_establish_pmd(&state); + if (err) break; - } - dst_pmdval = pmdp_get_lockless(dst_pmd); - if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_pmd))) { - err = -ENOMEM; - break; - } - dst_pmdval = pmdp_get_lockless(dst_pmd); - /* - * If the dst_pmd is THP 
don't override it and just be strict. - * (This includes the case where the PMD used to be THP and - * changed back to none after __pte_alloc().) - */ - if (unlikely(!pmd_present(dst_pmdval) || - pmd_trans_huge(dst_pmdval))) { - err = -EEXIST; - break; - } - if (unlikely(pmd_bad(dst_pmdval))) { - err = -EFAULT; - break; - } /* * For shmem mappings, khugepaged is allowed to remove page * tables under us; pte_offset_map_lock() will deal with that. */ - err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, - src_addr, flags, &folio); + err = mfill_atomic_pte(&state); cond_resched(); - if (unlikely(err == -ENOENT)) { - void *kaddr; - - up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(dst_vma); - VM_WARN_ON_ONCE(!folio); - - kaddr = kmap_local_folio(folio, 0); - err = copy_from_user(kaddr, - (const void __user *) src_addr, - PAGE_SIZE); - kunmap_local(kaddr); - if (unlikely(err)) { - err = -EFAULT; - goto out; - } - flush_dcache_folio(folio); - goto retry; - } else - VM_WARN_ON_ONCE(folio); - if (!err) { - dst_addr += PAGE_SIZE; - src_addr += PAGE_SIZE; + state.dst_addr += PAGE_SIZE; + state.src_addr += PAGE_SIZE; copied += PAGE_SIZE; if (fatal_signal_pending(current)) @@ -853,12 +933,8 @@ retry: break; } -out_unlock: - up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(dst_vma); + mfill_put_vma(&state); out: - if (folio) - folio_put(folio); VM_WARN_ON_ONCE(copied < 0); VM_WARN_ON_ONCE(err > 0); VM_WARN_ON_ONCE(!copied && !err); @@ -1229,7 +1305,7 @@ static int move_zeropage_pte(struct mm_struct *mm, return -EAGAIN; } - zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + zero_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); ptep_clear_flush(src_vma, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, zero_pte); @@ -1938,6 +2014,38 @@ out: return moved ? 
moved : err; } +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async) +{ + const struct vm_uffd_ops *ops = vma_uffd_ops(vma); + + if (vma->vm_flags & VM_DROPPABLE) + return false; + + vm_flags &= __VM_UFFD_FLAGS; + + /* + * If WP is the only mode enabled and context is wp async, allow any + * memory type. + */ + if (wp_async && (vm_flags == VM_UFFD_WP)) + return true; + + /* For any other mode reject VMAs that don't implement vm_uffd_ops */ + if (!ops) + return false; + + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then only anonymous memory is supported + */ + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) + return false; + + return ops->can_userfault(vma, vm_flags); +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t vm_flags) { @@ -1976,6 +2084,9 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, { struct vm_area_struct *ret; bool give_up_on_oom = false; + vma_flags_t new_vma_flags = vma->flags; + + vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); /* * If we are modifying only and not splitting, just give up on the merge @@ -1989,8 +2100,8 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, uffd_wp_range(vma, start, end - start, false); ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, - vma->vm_flags & ~__VM_UFFD_FLAGS, - NULL_VM_UFFD_CTX, give_up_on_oom); + &new_vma_flags, NULL_VM_UFFD_CTX, + give_up_on_oom); /* * In the vma_merge() successful mprotect-like case 8: @@ -2010,10 +2121,11 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long end, bool wp_async) { + vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; - vm_flags_t new_flags; + vma_flags_t new_vma_flags; if (vma->vm_start < start) prev = vma; @@ -2024,23 
+2136,26 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); - VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT)); /* * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. */ if (vma->vm_userfaultfd_ctx.ctx == ctx && - (vma->vm_flags & vm_flags) == vm_flags) + vma_test_all_mask(vma, vma_flags)) goto skip; if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + new_vma_flags = vma->flags; + vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); + vma_flags_set_mask(&new_vma_flags, vma_flags); + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, + &new_vma_flags, (struct vm_userfaultfd_ctx){ctx}, /* give_up_on_oom = */false); if (IS_ERR(vma)) diff --git a/mm/util.c b/mm/util.c index b05ab6f97e11..232c3930a662 100644 --- a/mm/util.c +++ b/mm/util.c @@ -618,6 +618,35 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK +/* + * Perform a userland memory mapping for a shadow stack into the current + * process address space. This is intended to be used by architectures that + * support user shadow stacks. 
+ */ +unsigned long vm_mmap_shadow_stack(unsigned long addr, unsigned long len, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + unsigned long ret, unused; + vm_flags_t vm_flags = VM_SHADOW_STACK; + + flags |= MAP_ANONYMOUS | MAP_PRIVATE; + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + vm_flags |= VM_NOHUGEPAGE; + + mmap_write_lock(mm); + ret = do_mmap(NULL, addr, len, PROT_READ | PROT_WRITE, flags, + vm_flags, 0, &unused, NULL); + mmap_write_unlock(mm); + + return ret; +} +#endif /* CONFIG_ARCH_HAS_USER_SHADOW_STACK */ + /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. @@ -1135,39 +1164,75 @@ EXPORT_SYMBOL(flush_dcache_folio); #endif /** - * __compat_vma_mmap() - See description for compat_vma_mmap() - * for details. This is the same operation, only with a specific file operations - * struct which may or may not be the same as vma->vm_file->f_op. - * @f_op: The file operations whose .mmap_prepare() hook is specified. - * @file: The file which backs or will back the mapping. - * @vma: The VMA to apply the .mmap_prepare() hook to. + * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA. + * @desc: A VMA descriptor whose fields need to be set. + * @file: The file object describing the file being mmap()'d. + * @vma: The VMA whose fields we wish to assign to @desc. + * + * This is a compatibility function to allow an mmap() hook to call + * mmap_prepare() hooks when drivers nest these. This function specifically + * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for + * the purposes of doing this. + * + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. 
+ */ +void compat_set_desc_from_vma(struct vm_area_desc *desc, + const struct file *file, + const struct vm_area_struct *vma) +{ + memset(desc, 0, sizeof(*desc)); + + desc->mm = vma->vm_mm; + desc->file = (struct file *)file; + desc->start = vma->vm_start; + desc->end = vma->vm_end; + + desc->pgoff = vma->vm_pgoff; + desc->vm_file = vma->vm_file; + desc->vma_flags = vma->flags; + desc->page_prot = vma->vm_page_prot; + + /* Default. */ + desc->action.type = MMAP_NOTHING; +} +EXPORT_SYMBOL(compat_set_desc_from_vma); + +/** + * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows + * flexibility as to how the mmap_prepare callback is invoked, which is useful + * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook. + * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been + * executed. + * @vma: The VMA to which @desc should be applied. + * + * The function assumes that you have obtained a VMA descriptor @desc from + * compat_set_desc_from_vma(), and already executed the mmap_prepare() hook upon + * it. + * + * It then performs any specified mmap actions, and invokes the vm_ops->mapped() + * hook if one is present. + * + * See the description of compat_vma_mmap() for more details. + * + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. + * * Returns: 0 on success or error. */ -int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma) -{ - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vma_flags = vma->flags, - .page_prot = vma->vm_page_prot, - - .action.type = MMAP_NOTHING, /* Default */ - }; +int __compat_vma_mmap(struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ int err; - err = f_op->mmap_prepare(&desc); + /* Perform any preparatory tasks for mmap action. 
*/ + err = mmap_action_prepare(desc); if (err) return err; - - mmap_action_prepare(&desc.action, &desc); - set_vma_from_desc(vma, &desc); - return mmap_action_complete(&desc.action, vma); + /* Update the VMA from the descriptor. */ + compat_set_vma_from_desc(vma, desc); + /* Complete any specified mmap actions. */ + return mmap_action_complete(vma, &desc->action); } EXPORT_SYMBOL(__compat_vma_mmap); @@ -1178,10 +1243,10 @@ EXPORT_SYMBOL(__compat_vma_mmap); * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * stacked filesystems invoke a nested mmap hook of an underlying file. + * stacked drivers invoke a nested mmap hook of an underlying file. * - * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these stacked filesystems using the + * Until all drivers are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these stacked drivers using the * deprecated .mmap() hook. * * However we have a problem if the underlying file system possesses an @@ -1192,14 +1257,27 @@ EXPORT_SYMBOL(__compat_vma_mmap); * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * - * Once the conversion of filesystems is complete this function will no longer - * be required and will be removed. + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. * * Returns: 0 on success or error. */ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap(file->f_op, file, vma); + struct vm_area_desc desc; + struct mmap_action *action; + int err; + + compat_set_desc_from_vma(&desc, file, vma); + err = vfs_mmap_prepare(file, &desc); + if (err) + return err; + action = &desc.action; + + /* being invoked from .mmmap means we don't have to enforce this. 
*/ + action->hide_from_rmap_until_complete = false; + + return __compat_vma_mmap(&desc, vma); } EXPORT_SYMBOL(compat_vma_mmap); @@ -1237,7 +1315,7 @@ static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, */ void snapshot_page(struct page_snapshot *ps, const struct page *page) { - unsigned long head, nr_pages = 1; + unsigned long info, nr_pages = 1; struct folio *foliop; int loops = 5; @@ -1247,8 +1325,8 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page) again: memset(&ps->folio_snapshot, 0, sizeof(struct folio)); memcpy(&ps->page_snapshot, page, sizeof(*page)); - head = ps->page_snapshot.compound_head; - if ((head & 1) == 0) { + info = ps->page_snapshot.compound_info; + if (!(info & 1)) { ps->idx = 0; foliop = (struct folio *)&ps->page_snapshot; if (!folio_test_large(foliop)) { @@ -1259,7 +1337,15 @@ again: } foliop = (struct folio *)page; } else { - foliop = (struct folio *)(head - 1); + /* See compound_head() */ + if (compound_info_has_mask()) { + unsigned long p = (unsigned long)page; + + foliop = (struct folio *)(p & info); + } else { + foliop = (struct folio *)(info - 1); + } + ps->idx = folio_page_idx(foliop, page); } @@ -1283,70 +1369,95 @@ again: } } -static int mmap_action_finish(struct mmap_action *action, - const struct vm_area_struct *vma, int err) +static int call_vma_mapped(struct vm_area_struct *vma) +{ + const struct vm_operations_struct *vm_ops = vma->vm_ops; + void *vm_private_data = vma->vm_private_data; + int err; + + if (!vm_ops || !vm_ops->mapped) + return 0; + + err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_file, &vm_private_data); + if (err) + return err; + + if (vm_private_data != vma->vm_private_data) + vma->vm_private_data = vm_private_data; + return 0; +} + +static int mmap_action_finish(struct vm_area_struct *vma, + struct mmap_action *action, int err) { + size_t len; + + if (!err) + err = call_vma_mapped(vma); + if (!err && action->success_hook) + err = 
action->success_hook(vma); + + /* do_munmap() might take rmap lock, so release if held. */ + maybe_rmap_unlock_action(vma, action); + if (!err) + return 0; + /* * If an error occurs, unmap the VMA altogether and return an error. We * only clear the newly allocated VMA, since this function is only * invoked if we do NOT merge, so we only clean up the VMA we created. */ - if (err) { - const size_t len = vma_pages(vma) << PAGE_SHIFT; - - do_munmap(current->mm, vma->vm_start, len, NULL); - - if (action->error_hook) { - /* We may want to filter the error. */ - err = action->error_hook(err); - - /* The caller should not clear the error. */ - VM_WARN_ON_ONCE(!err); - } - return err; + len = vma_pages(vma) << PAGE_SHIFT; + do_munmap(current->mm, vma->vm_start, len, NULL); + if (action->error_hook) { + /* We may want to filter the error. */ + err = action->error_hook(err); + /* The caller should not clear the error. */ + VM_WARN_ON_ONCE(!err); } - - if (action->success_hook) - return action->success_hook(vma); - - return 0; + return err; } #ifdef CONFIG_MMU /** * mmap_action_prepare - Perform preparatory setup for an VMA descriptor * action which need to be performed. - * @desc: The VMA descriptor to prepare for @action. - * @action: The action to perform. + * @desc: The VMA descriptor to prepare for its @desc->action. + * + * Returns: %0 on success, otherwise error. 
*/ -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +int mmap_action_prepare(struct vm_area_desc *desc) { - switch (action->type) { + switch (desc->action.type) { case MMAP_NOTHING: - break; + return 0; case MMAP_REMAP_PFN: - remap_pfn_range_prepare(desc, action->remap.start_pfn); - break; + return remap_pfn_range_prepare(desc); case MMAP_IO_REMAP_PFN: - io_remap_pfn_range_prepare(desc, action->remap.start_pfn, - action->remap.size); - break; + return io_remap_pfn_range_prepare(desc); + case MMAP_SIMPLE_IO_REMAP: + return simple_ioremap_prepare(desc); + case MMAP_MAP_KERNEL_PAGES: + return map_kernel_pages_prepare(desc); } + + WARN_ON_ONCE(1); + return -EINVAL; } EXPORT_SYMBOL(mmap_action_prepare); /** * mmap_action_complete - Execute VMA descriptor action. - * @action: The action to perform. * @vma: The VMA to perform the action upon. + * @action: The action to perform. * * Similar to mmap_action_prepare(). * * Return: 0 on success, or error, at which point the VMA will be unmapped. */ -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) { int err = 0; @@ -1354,37 +1465,42 @@ int mmap_action_complete(struct mmap_action *action, case MMAP_NOTHING: break; case MMAP_REMAP_PFN: - err = remap_pfn_range_complete(vma, action->remap.start, - action->remap.start_pfn, action->remap.size, - action->remap.pgprot); + err = remap_pfn_range_complete(vma, action); + break; + case MMAP_MAP_KERNEL_PAGES: + err = map_kernel_pages_complete(vma, action); break; case MMAP_IO_REMAP_PFN: - err = io_remap_pfn_range_complete(vma, action->remap.start, - action->remap.start_pfn, action->remap.size, - action->remap.pgprot); + case MMAP_SIMPLE_IO_REMAP: + /* Should have been delegated. 
*/ + WARN_ON_ONCE(1); + err = -EINVAL; break; } - return mmap_action_finish(action, vma, err); + return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #else -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +int mmap_action_prepare(struct vm_area_desc *desc) { - switch (action->type) { + switch (desc->action.type) { case MMAP_NOTHING: break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: + case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } + + return 0; } EXPORT_SYMBOL(mmap_action_prepare); -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) { int err = 0; @@ -1393,13 +1509,15 @@ int mmap_action_complete(struct mmap_action *action, break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: + case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle this. */ err = -EINVAL; break; } - return mmap_action_finish(action, vma, err); + return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #endif @@ -38,13 +38,11 @@ struct mmap_state { /* Determine if we can check KSM flags early in mmap() logic. */ bool check_ksm_early :1; - /* If we map new, hold the file rmap lock on mapping. */ - bool hold_file_rmap_lock :1; /* If .mmap_prepare changed the file, we don't need to pin. 
*/ bool file_doesnt_need_get :1; }; -#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ +#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vma_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ @@ -52,9 +50,9 @@ struct mmap_state { .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ - .vm_flags = vm_flags_, \ + .vma_flags = vma_flags_, \ .file = file_, \ - .page_prot = vm_get_page_prot(vm_flags_), \ + .page_prot = vma_get_page_prot(vma_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -63,7 +61,7 @@ struct mmap_state { .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ - .vm_flags = (map_)->vm_flags, \ + .vma_flags = (map_)->vma_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ @@ -86,10 +84,15 @@ static bool vma_is_fork_child(struct vm_area_struct *vma) static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; + vma_flags_t diff; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; - if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) + + diff = vma_flags_diff_pair(&vma->flags, &vmg->vma_flags); + vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); + + if (!vma_flags_empty(&diff)) return false; if (vma->vm_file != vmg->file) return false; @@ -180,7 +183,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, } /* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. 
* * We cannot merge two vmas if they have differently assigned (non-NULL) @@ -206,7 +209,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) } /* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) @@ -590,7 +593,7 @@ out_free_vma: static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (vma->vm_mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); @@ -805,7 +808,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { - vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; + vma_flags_t sticky_flags = vma_flags_and_mask(&vmg->vma_flags, + VMA_STICKY_FLAGS); struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; @@ -844,7 +848,8 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. 
*/ - if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) + if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || + (!left_side && !right_side)) return NULL; if (left_side) @@ -898,15 +903,22 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( vma_start_write(middle); if (merge_right) { + vma_flags_t next_sticky; + vma_start_write(next); vmg->target = next; - sticky_flags |= (next->vm_flags & VM_STICKY); + next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, next_sticky); } if (merge_left) { + vma_flags_t prev_sticky; + vma_start_write(prev); vmg->target = prev; - sticky_flags |= (prev->vm_flags & VM_STICKY); + + prev_sticky = vma_flags_and_mask(&prev->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, prev_sticky); } if (merge_both) { @@ -976,7 +988,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; - vm_flags_set(vmg->target, sticky_flags); + vma_set_flags_mask(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1059,7 +1071,8 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. 
*/ - if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) + if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || + (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); @@ -1154,12 +1167,16 @@ int vma_expand(struct vma_merge_struct *vmg) struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; bool remove_next = false; - vm_flags_t sticky_flags; + vma_flags_t sticky_flags = + vma_flags_and_mask(&vmg->vma_flags, VMA_STICKY_FLAGS); + vma_flags_t target_sticky; int ret = 0; mmap_assert_write_locked(vmg->mm); vma_start_write(target); + target_sticky = vma_flags_and_mask(&target->flags, VMA_STICKY_FLAGS); + if (next && target != next && vmg->end == next->vm_end) remove_next = true; @@ -1174,10 +1191,7 @@ int vma_expand(struct vma_merge_struct *vmg) VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); - sticky_flags = vmg->vm_flags & VM_STICKY; - sticky_flags |= target->vm_flags & VM_STICKY; - if (remove_next) - sticky_flags |= next->vm_flags & VM_STICKY; + vma_flags_set_mask(&sticky_flags, target_sticky); /* * If we are removing the next VMA or copying from a VMA @@ -1194,13 +1208,18 @@ int vma_expand(struct vma_merge_struct *vmg) return ret; if (remove_next) { + vma_flags_t next_sticky; + vma_start_write(next); vmg->__remove_next = true; + + next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, next_sticky); } if (commit_merge(vmg)) goto nomem; - vm_flags_set(target, sticky_flags); + vma_set_flags_mask(target, sticky_flags); return 0; nomem: @@ -1394,7 +1413,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, * its limit temporarily, to help free resources as expected. 
*/ if (vms->end < vms->vma->vm_end && - vms->vma->vm_mm->map_count >= sysctl_max_map_count) { + vms->vma->vm_mm->map_count >= get_sysctl_max_map_count()) { error = -ENOMEM; goto map_count_exceeded; } @@ -1440,17 +1459,17 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, nrpages = vma_pages(next); vms->nr_pages += nrpages; - if (next->vm_flags & VM_LOCKED) + if (vma_test(next, VMA_LOCKED_BIT)) vms->locked_vm += nrpages; - if (next->vm_flags & VM_ACCOUNT) + if (vma_test(next, VMA_ACCOUNT_BIT)) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; - else if (is_data_mapping(next->vm_flags)) + else if (is_data_mapping_vma_flags(&next->flags)) vms->data_vm += nrpages; if (vms->uf) { @@ -1689,13 +1708,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - vm_flags_t *vm_flags_ptr) + vma_flags_t *vma_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - const vm_flags_t vm_flags = *vm_flags_ptr; + const vma_flags_t vma_flags = *vma_flags_ptr; struct vm_area_struct *ret; - vmg.vm_flags = vm_flags; + vmg.vma_flags = vma_flags; ret = vma_modify(&vmg); if (IS_ERR(ret)) @@ -1707,7 +1726,7 @@ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, * them to the caller. 
*/ if (vmg.state == VMA_MERGE_SUCCESS) - *vm_flags_ptr = ret->vm_flags; + *vma_flags_ptr = ret->flags; return ret; } @@ -1737,12 +1756,13 @@ struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t vm_flags, - struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) + unsigned long start, unsigned long end, + const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.vm_flags = vm_flags; + vmg.vma_flags = *vma_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; @@ -1950,10 +1970,15 @@ out: */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { + vma_flags_t diff = vma_flags_diff_pair(&a->flags, &b->flags); + + vma_flags_clear_mask(&diff, VMA_ACCESS_FLAGS); + vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); + return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && + vma_flags_empty(&diff) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } @@ -2041,14 +2066,13 @@ static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) static bool vma_is_shared_writable(struct vm_area_struct *vma) { - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == - (VM_WRITE | VM_SHARED); + return vma_test_all(vma, VMA_WRITE_BIT, VMA_SHARED_BIT); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. 
*/ - if (vma->vm_flags & VM_PFNMAP) + if (vma_test(vma, VMA_PFNMAP_BIT)) return false; return vma->vm_file && vma->vm_file->f_mapping && @@ -2314,8 +2338,10 @@ void mm_drop_all_locks(struct mm_struct *mm) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ -static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) +static bool accountable_mapping(struct mmap_state *map) { + const struct file *file = map->file; + /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. @@ -2323,7 +2349,9 @@ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) if (file && is_file_hugepages(file)) return false; - return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; + return vma_flags_test(&map->vma_flags, VMA_WRITE_BIT) && + !vma_flags_test_any(&map->vma_flags, VMA_NORESERVE_BIT, + VMA_SHARED_BIT); } /* @@ -2361,7 +2389,7 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, static void update_ksm_flags(struct mmap_state *map) { - map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); + map->vma_flags = ksm_vma_flags(map->mm, map->file, map->vma_flags); } static void set_desc_from_map(struct vm_area_desc *desc, @@ -2422,11 +2450,11 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, } /* Check against address space limit. */ - if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) + if (!may_expand_vm(map->mm, &map->vma_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. 
*/ - if (accountable_mapping(map->file, map->vm_flags)) { + if (accountable_mapping(map)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { @@ -2436,7 +2464,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, } vms->nr_accounted = 0; - map->vm_flags |= VM_ACCOUNT; + vma_flags_set(&map->vma_flags, VMA_ACCOUNT_BIT); } /* @@ -2484,12 +2512,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * Drivers should not permit writability when previously it was * disallowed. */ - VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && - !(map->vm_flags & VM_MAYWRITE) && - (vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_flags_same_pair(&map->vma_flags, &vma->flags) && + !vma_flags_test(&map->vma_flags, VMA_MAYWRITE_BIT) && + vma_test(vma, VMA_MAYWRITE_BIT)); map->file = vma->vm_file; - map->vm_flags = vma->vm_flags; + map->vma_flags = vma->flags; return 0; } @@ -2500,10 +2528,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * * @map: Mapping state. * @vmap: Output pointer for the new VMA. + * @action: Any mmap_prepare action that is still to complete. * * Returns: Zero on success, or an error. 
*/ -static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) +static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap, + struct mmap_action *action) { struct vma_iterator *vmi = map->vmi; int error = 0; @@ -2520,7 +2550,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); - vm_flags_init(vma, map->vm_flags); + vma->flags = map->vma_flags; vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { @@ -2530,7 +2560,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (map->file) error = __mmap_new_file_vma(map, vma); - else if (map->vm_flags & VM_SHARED) + else if (vma_flags_test(&map->vma_flags, VMA_SHARED_BIT)) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); @@ -2540,7 +2570,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (!map->check_ksm_early) { update_ksm_flags(map); - vm_flags_init(vma, map->vm_flags); + vma->flags = map->vma_flags; } #ifdef CONFIG_SPARC64 @@ -2552,7 +2582,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; - vma_link_file(vma, map->hold_file_rmap_lock); + vma_link_file(vma, action->hide_from_rmap_until_complete); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below @@ -2580,7 +2610,6 @@ free_vma: static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; - vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); @@ -2588,11 +2617,9 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); - if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - 
is_vm_hugetlb_page(vma) || - vma == get_gate_vma(mm)) - vm_flags_clear(vma, VM_LOCKED_MASK); + if (vma_test(vma, VMA_LOCKED_BIT)) { + if (!vma_supports_mlock(vma)) + vma_clear_flags_mask(vma, VMA_LOCKED_MASK); else mm->locked_vm += map->pglen; } @@ -2608,20 +2635,21 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * a completely new data area). */ if (pgtable_supports_soft_dirty()) - vm_flags_set(vma, VM_SOFTDIRTY); + vma_set_flags(vma, VMA_SOFTDIRTY_BIT); vma_set_page_prot(vma); } -static void call_action_prepare(struct mmap_state *map, - struct vm_area_desc *desc) +static int call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { - struct mmap_action *action = &desc->action; + int err; - mmap_action_prepare(action, desc); + err = mmap_action_prepare(desc); + if (err) + return err; - if (action->hide_from_rmap_until_complete) - map->hold_file_rmap_lock = true; + return 0; } /* @@ -2645,7 +2673,9 @@ static int call_mmap_prepare(struct mmap_state *map, if (err) return err; - call_action_prepare(map, desc); + err = call_action_prepare(map, desc); + if (err) + return err; /* Update fields permitted to be changed. */ map->pgoff = desc->pgoff; @@ -2699,33 +2729,15 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) return false; } -static int call_action_complete(struct mmap_state *map, - struct vm_area_desc *desc, - struct vm_area_struct *vma) -{ - struct mmap_action *action = &desc->action; - int ret; - - ret = mmap_action_complete(action, vma); - - /* If we held the file rmap we need to release it. 
*/ - if (map->hold_file_rmap_lock) { - struct file *file = vma->vm_file; - - i_mmap_unlock_write(file->f_mapping); - } - return ret; -} - static unsigned long __mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + unsigned long len, vma_flags_t vma_flags, + unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); - MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vma_flags, file); struct vm_area_desc desc = { .mm = mm, .file = file, @@ -2756,7 +2768,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, /* ...but if we can't, allocate a new VMA. */ if (!vma) { - error = __mmap_new_vma(&map, &vma); + error = __mmap_new_vma(&map, &vma, &desc.action); if (error) goto unacct_error; allocated_new = true; @@ -2768,8 +2780,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { - error = call_action_complete(&map, &desc, vma); - + error = mmap_action_complete(vma, &desc.action); if (error) return error; } @@ -2781,6 +2792,13 @@ unacct_error: if (map.charged) vm_unacct_memory(map.charged); abort_munmap: + /* + * This indicates that .mmap_prepare has set a new file, differing from + * desc->vm_file. But since we're aborting the operation, only the + * original file will be cleaned up. Ensure we clean up both. + */ + if (map.file_doesnt_need_get) + fput(map.file); vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } @@ -2809,16 +2827,17 @@ abort_munmap: * been performed. 
*/ unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; + const vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ - if (map_deny_write_exec(vm_flags, vm_flags)) + if (map_deny_write_exec(&vma_flags, &vma_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ @@ -2826,7 +2845,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ - if (file && is_shared_maywrite_vm_flags(vm_flags)) { + if (file && is_shared_maywrite(&vma_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) @@ -2834,7 +2853,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, writable_file_mapping = true; } - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + ret = __mmap_region(file, addr, len, vma_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) @@ -2844,20 +2863,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return ret; } -/* +/** * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, - * @vm_flags: The VMA Flags + * @vma_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. + * + * Returns: %0 on success, or otherwise an error. 
*/ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, vm_flags_t vm_flags) + unsigned long addr, unsigned long len, vma_flags_t vma_flags) { struct mm_struct *mm = current->mm; @@ -2865,12 +2886,15 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ - vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - vm_flags = ksm_vma_flags(mm, NULL, vm_flags); - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) + vma_flags_set_mask(&vma_flags, VMA_DATA_DEFAULT_FLAGS); + vma_flags_set(&vma_flags, VMA_ACCOUNT_BIT); + vma_flags_set_mask(&vma_flags, mm->def_vma_flags); + + vma_flags = ksm_vma_flags(mm, NULL, vma_flags); + if (!may_expand_vm(mm, &vma_flags, len >> PAGE_SHIFT)) return -ENOMEM; - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) @@ -2881,7 +2905,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); + VMG_STATE(vmg, mm, vmi, addr, addr + len, vma_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. 
*/ @@ -2902,8 +2926,8 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, vm_flags); - vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->flags = vma_flags; + vma->vm_page_prot = vm_get_page_prot(vma_flags_to_legacy(vma_flags)); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -2914,10 +2938,10 @@ out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; - if (vm_flags & VM_LOCKED) + if (vma_flags_test(&vma_flags, VMA_LOCKED_BIT)) mm->locked_vm += (len >> PAGE_SHIFT); if (pgtable_supports_soft_dirty()) - vm_flags_set(vma, VM_SOFTDIRTY); + vma_set_flags(vma, VMA_SOFTDIRTY_BIT); return 0; mas_store_fail: @@ -2966,7 +2990,8 @@ retry: gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + /* Avoid prev check if possible */ + if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); @@ -3018,7 +3043,8 @@ retry: gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + /* Avoid prev check if possible */ + if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); @@ -3048,7 +3074,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long new_start; /* address space limit tests */ - if (!may_expand_vm(mm, vma->vm_flags, grow)) + if (!may_expand_vm(mm, &vma->flags, grow)) return -ENOMEM; /* Stack limit test */ @@ -3056,12 +3082,16 @@ static int acct_stack_growth(struct vm_area_struct *vma, 
return -ENOMEM; /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT)) + if (!mlock_future_ok(mm, vma_test(vma, VMA_LOCKED_BIT), + grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ - new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : - vma->vm_end - size; + new_start = vma->vm_end - size; +#ifdef CONFIG_STACK_GROWSUP + if (vma_test(vma, VMA_GROWSUP_BIT)) + new_start = vma->vm_start; +#endif if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; @@ -3075,7 +3105,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return 0; } -#if defined(CONFIG_STACK_GROWSUP) +#ifdef CONFIG_STACK_GROWSUP /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. @@ -3088,7 +3118,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); - if (!(vma->vm_flags & VM_GROWSUP)) + if (!vma_test(vma, VMA_GROWSUP_BIT)) return -EFAULT; mmap_assert_write_locked(mm); @@ -3108,7 +3138,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { - if (!(next->vm_flags & VM_GROWSUP)) + if (!vma_test(next, VMA_GROWSUP_BIT)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? 
*/ } @@ -3142,7 +3172,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - if (vma->vm_flags & VM_LOCKED) + if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); @@ -3173,7 +3203,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); - if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!vma_test(vma, VMA_GROWSDOWN_BIT)) return -EFAULT; mmap_assert_write_locked(mm); @@ -3186,7 +3216,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? */ if (prev) { - if (!(prev->vm_flags & VM_GROWSDOWN) && + if (!vma_test(prev, VMA_GROWSDOWN_BIT) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; @@ -3221,7 +3251,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - if (vma->vm_flags & VM_LOCKED) + if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); @@ -3267,11 +3297,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; - if ((vma->vm_flags & VM_ACCOUNT) && + if (vma_test(vma, VMA_ACCOUNT_BIT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; @@ -3293,10 +3322,31 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) } if (vma_link(mm, vma)) { - if (vma->vm_flags & VM_ACCOUNT) + if (vma_test(vma, VMA_ACCOUNT_BIT)) vm_unacct_memory(charged); return -ENOMEM; } return 0; } + +/** 
+ * vma_mmu_pagesize - Default MMU page size granularity for this VMA. + * @vma: The user mapping. + * + * In the common case, the default page size used by the MMU matches the + * default page size used by the kernel (see vma_kernel_pagesize()). On + * architectures where it differs, an architecture-specific 'strong' version + * of this symbol is required. + * + * The default MMU page size is not affected by Transparent Huge Pages + * being in effect, or any usage of larger MMU page sizes (either through + * architectural huge-page mappings or other explicit/implicit coalescing of + * virtual ranges performed by the MMU). + * + * Return: The default MMU page size granularity for this VMA. + */ +__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} @@ -98,7 +98,11 @@ struct vma_merge_struct { unsigned long end; pgoff_t pgoff; - vm_flags_t vm_flags; + union { + /* Temporary while VMA flags are being converted. */ + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; @@ -233,13 +237,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } -#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ +#define VMG_STATE(name, mm_, vmi_, start_, end_, vma_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ - .vm_flags = vm_flags_, \ + .vma_flags = vma_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } @@ -296,7 +300,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). 
*/ -static inline void set_vma_from_desc(struct vm_area_struct *vma, +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { /* @@ -338,24 +342,23 @@ void unmap_region(struct unmap_desc *unmap); * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. - * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is + * @vma_flags_ptr: A pointer to the VMA flags that the @start to @end range is * about to be set to. On merge, this will be updated to include sticky flags. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * - * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points + * In order to account for sticky VMA flags, the @vma_flags_ptr parameter points * to the requested flags which are then updated so the caller, should they * overwrite any existing flags, correctly retains these. * * Returns: A VMA which contains the range @start to @end ready to have its - * flags altered to *@vm_flags. + * flags altered to *@vma_flags. */ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - vm_flags_t *vm_flags_ptr); + unsigned long start, unsigned long end, vma_flags_t *vma_flags_ptr); /** * vma_modify_name() - Perform any necessary split/merge in preparation for @@ -414,7 +417,7 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. 
* @end: The exclusive end of the range to update, may be offset within @vma. - * @vm_flags: The VMA flags that the @start to @end range is about to be set to. + * @vma_flags: The VMA flags that the @start to @end range is about to be set to. * @new_ctx: The userfaultfd context that the @start to @end range is about to * be set to. * @give_up_on_oom: If an out of memory condition occurs on merge, simply give @@ -425,11 +428,11 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, * and the caller is expected to perform the actual modification. * * Returns: A VMA which contains the range @start to @end ready to have its VMA - * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx. + * flags changed to @vma_flags and its userfaultfd context changed to @new_ctx. */ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t vm_flags, + unsigned long start, unsigned long end, const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); @@ -461,7 +464,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, unsigned long flags); + unsigned long addr, unsigned long request, + vma_flags_t vma_flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); @@ -523,6 +527,11 @@ static inline bool is_data_mapping(vm_flags_t flags) return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } +static inline bool is_data_mapping_vma_flags(const vma_flags_t *vma_flags) +{ + return vma_flags_test(vma_flags, VMA_WRITE_BIT) && + 
!vma_flags_test_any(vma_flags, VMA_SHARED_BIT, VMA_STACK_BIT); +} static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) @@ -693,4 +702,55 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif +#ifdef CONFIG_MMU +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + * + * This is only applicable if the user has set the Memory-Deny-Write-Execute + * (MDWE) protection mask for the current process. + * + * @old specifies the VMA flags the VMA originally possessed, and @new the ones + * we propose to set. + * + * Return: false if proposed change is OK, true if not ok and should be denied. + */ +static inline bool map_deny_write_exec(const vma_flags_t *old, + const vma_flags_t *new) +{ + /* If MDWE is disabled, we have nothing to deny. */ + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) + return false; + + /* If the new VMA is not executable, we have nothing to deny. */ + if (!vma_flags_test(new, VMA_EXEC_BIT)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... */ + if (vma_flags_test(new, VMA_WRITE_BIT)) + return true; + + /* ...nor previously non-executable VMAs becoming executable. 
*/ + if (!vma_flags_test(old, VMA_EXEC_BIT)) + return true; + + return false; +} +#endif + #endif /* __MM_VMA_H */ diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 8134e1afca68..5cee8b7efa0f 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -36,7 +36,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, EMPTY_VMA_FLAGS, + vma->vm_pgoff); struct vm_area_struct *next; struct mmu_gather tlb; PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); @@ -135,7 +136,7 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + VM_WARN_ON_ONCE(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; if (pgtable_supports_soft_dirty()) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 61caa55a4402..aa08651ec0df 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1068,14 +1068,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); -static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages; static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr; -unsigned long vmalloc_nr_pages(void) -{ - return atomic_long_read(&nr_vmalloc_pages); -} - static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; @@ -3189,7 +3183,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } -static void clear_vm_uninitialized_flag(struct vm_struct *vm) +void 
clear_vm_uninitialized_flag(struct vm_struct *vm) { /* * Before removing VM_UNINITIALIZED, @@ -3465,9 +3459,6 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); - /* All pages of vm should be charged to same memcg, so use first one. */ - if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) - mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; @@ -3476,11 +3467,11 @@ void vfree(const void *addr) * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ + if (!(vm->flags & VM_MAP_PUT_PAGES)) + mod_lruvec_page_state(page, NR_VMALLOC, -1); __free_page(page); cond_resched(); } - if (!(vm->flags & VM_MAP_PUT_PAGES)) - atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } @@ -3668,6 +3659,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, continue; } + mod_lruvec_page_state(page, NR_VMALLOC, 1 << large_order); + split_page(page, large_order); for (i = 0; i < (1U << large_order); i++) pages[nr_allocated + i] = page + i; @@ -3688,6 +3681,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (!order) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; + int i; /* * A maximum allowed request is hard-coded and is 100 @@ -3711,6 +3705,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, nr_pages_request, pages + nr_allocated); + for (i = nr_allocated; i < nr_allocated + nr; i++) + mod_lruvec_page_state(pages[i], NR_VMALLOC, 1); + nr_allocated += nr; /* @@ -3735,6 +3732,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (unlikely(!page)) break; + mod_lruvec_page_state(page, NR_VMALLOC, 1 << order); + /* * High-order allocations must be able to be treated as * independent small pages by callers (as they can with @@ -3798,6 +3797,8 @@ static void defer_vm_area_cleanup(struct vm_struct *area) * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() * GFP_NOFS - 
memalloc_nofs_save() * GFP_NOIO - memalloc_noio_save() + * __GFP_RETRY_MAYFAIL, __GFP_NORETRY - memalloc_noreclaim_save() + * to prevent OOMs * * Returns a flag cookie to pair with restore. */ @@ -3806,7 +3807,8 @@ memalloc_apply_gfp_scope(gfp_t gfp_mask) { unsigned int flags = 0; - if (!gfpflags_allow_blocking(gfp_mask)) + if (!gfpflags_allow_blocking(gfp_mask) || + (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_NORETRY))) flags = memalloc_noreclaim_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); @@ -3877,12 +3879,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, vmalloc_gfp_adjust(gfp_mask, page_order), node, page_order, nr_small_pages, area->pages); - atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - /* All pages of vm should be charged to same memcg, so use first one. */ - if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) - mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, - area->nr_pages); - /* * If not enough pages were obtained to accomplish an * allocation request, free them via vfree() if any. @@ -3901,7 +3897,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocate pages", - area->nr_pages * PAGE_SIZE); + nr_small_pages * PAGE_SIZE); goto fail; } @@ -3940,7 +3936,8 @@ fail: * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP. */ #define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\ - __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\ + __GFP_NOFAIL | __GFP_ZERO |\ + __GFP_NORETRY | __GFP_RETRY_MAYFAIL |\ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ GFP_USER | __GFP_NOLOCKDEP) @@ -3971,12 +3968,15 @@ static gfp_t vmalloc_fix_flags(gfp_t flags) * virtual range with protection @prot. * * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT, - * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported. 
+ * %__GFP_RETRY_MAYFAIL, %__GFP_NORETRY, %GFP_NOFS and %GFP_NOIO. + * Zone modifiers are not supported. * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only * by __vmalloc(). * - * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY - * and %__GFP_RETRY_MAYFAIL are not supported. + * Retry modifiers: only %__GFP_NOFAIL is fully supported; + * %__GFP_NORETRY and %__GFP_RETRY_MAYFAIL are supported with limitation, + * i.e. page tables are allocated with NOWAIT semantic so they might fail + * under moderate memory pressure. * * %__GFP_NOWARN can be used to suppress failure messages. * @@ -4575,20 +4575,20 @@ finished: * @count: number of bytes to be read. * * This function checks that addr is a valid vmalloc'ed area, and - * copy data from that area to a given buffer. If the given memory range + * copies data from that area to a given iterator. If the given memory range * of [addr...addr+count) includes some valid address, data is copied to - * proper area of @buf. If there are memory holes, they'll be zero-filled. + * proper area of @iter. If there are memory holes, they'll be zero-filled. * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. + * vm_struct area, returns 0. * - * Note: In usual ops, vread() is never necessary because the caller + * Note: In usual ops, vread_iter() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any information, as /proc/kcore. 
* - * Return: number of bytes for which addr and buf should be increased + * Return: number of bytes for which addr and iter should be advanced * (same number as @count) or %0 if [addr...addr+count) doesn't * include any intersection with valid vmalloc area */ @@ -5416,6 +5416,7 @@ vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct vmap_node *vn; + guard(mutex)(&vmap_purge_lock); for_each_vmap_node(vn) decay_va_pool_node(vn, true); diff --git a/mm/vmscan.c b/mm/vmscan.c index 0fc9373e8251..bd1b1aa12581 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -44,7 +44,7 @@ #include <linux/sysctl.h> #include <linux/memory-tiers.h> #include <linux/oom.h> -#include <linux/pagevec.h> +#include <linux/folio_batch.h> #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> @@ -269,25 +269,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) } #endif -/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to - * and including the specified highidx - * @zone: The current zone in the iterator - * @pgdat: The pgdat which node_zones are being iterated - * @idx: The index variable - * @highidx: The index of the highest zone to return - * - * This macro iterates through all managed zones up to and including the specified highidx. - * The zone iterator enters an invalid state after macro call and must be reinitialized - * before it can be used again. 
- */ -#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ - for ((idx) = 0, (zone) = (pgdat)->node_zones; \ - (idx) <= (highidx); \ - (idx)++, (zone)++) \ - if (!managed_zone(zone)) \ - continue; \ - else - static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { @@ -409,8 +390,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ -static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zone_idx) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; @@ -905,7 +885,7 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; - if (lru_gen_enabled()) { + if (lru_gen_enabled() && !lru_gen_switching()) { if (!referenced_ptes) return FOLIOREF_RECLAIM; @@ -963,8 +943,7 @@ static void folio_check_dirty_writeback(struct folio *folio, * They could be mistakenly treated as file lru. So further anon * test is needed. 
*/ - if (!folio_is_file_lru(folio) || - (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { + if (!folio_is_file_lru(folio) || folio_test_lazyfree(folio)) { *dirty = false; *writeback = false; return; @@ -986,13 +965,11 @@ static void folio_check_dirty_writeback(struct folio *folio, static struct folio *alloc_demote_folio(struct folio *src, unsigned long private) { + struct migration_target_control *mtc, target_nid_mtc; struct folio *dst; - nodemask_t *allowed_mask; - struct migration_target_control *mtc; mtc = (struct migration_target_control *)private; - allowed_mask = mtc->nmask; /* * make sure we allocate from the target node first also trying to * demote or reclaim pages from the target node via kswapd if we are @@ -1002,15 +979,13 @@ static struct folio *alloc_demote_folio(struct folio *src, * a demotion of cold pages from the target memtier. This can result * in the kernel placing hot pages in slower(lower) memory tiers. */ - mtc->nmask = NULL; - mtc->gfp_mask |= __GFP_THISNODE; - dst = alloc_migration_target(src, (unsigned long)mtc); + target_nid_mtc = *mtc; + target_nid_mtc.nmask = NULL; + target_nid_mtc.gfp_mask |= __GFP_THISNODE; + dst = alloc_migration_target(src, (unsigned long)&target_nid_mtc); if (dst) return dst; - mtc->gfp_mask &= ~__GFP_THISNODE; - mtc->nmask = allowed_mask; - return alloc_migration_target(src, (unsigned long)mtc); } @@ -1070,7 +1045,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) /* * We can "enter_fs" for swap-cache with only __GFP_IO * providing this isn't SWP_FS_OPS. - * ->flags can be updated non-atomically (scan_swap_map_slots), + * ->flags can be updated non-atomically, * but that will never affect SWP_FS_OPS, so the data_race * is safe. 
*/ @@ -1508,7 +1483,7 @@ retry: } } - if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + if (folio_test_lazyfree(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; @@ -1836,7 +1811,7 @@ bool folio_isolate_lru(struct folio *folio) folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); ret = true; } @@ -1890,24 +1865,27 @@ static bool too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * - * Returns the number of pages moved to the given lruvec. + * Returns the number of pages moved to the appropriate lruvec. + * + * Note: The caller must not hold any lruvec lock. */ -static unsigned int move_folios_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_folios_to_lru(struct list_head *list) { int nr_pages, nr_moved = 0; + struct lruvec *lruvec = NULL; struct folio_batch free_folios; folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); + lruvec = folio_lruvec_relock_irq(folio, lruvec); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); folio_putback_lru(folio); - spin_lock_irq(&lruvec->lru_lock); + lruvec = NULL; continue; } @@ -1929,20 +1907,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); - spin_lock_irq(&lruvec->lru_lock); + lruvec = NULL; } continue; } - /* - * All pages were isolated from the same lruvec (and isolation - * inhibits memcg migration). 
- */ - VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; @@ -1950,11 +1923,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, workingset_age_nonresident(lruvec, nr_pages); } + if (lruvec) + lruvec_unlock_irq(lruvec); + if (free_folios.nr) { - spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); - spin_lock_irq(&lruvec->lru_lock); } return nr_moved; @@ -1984,7 +1958,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, unsigned long nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); - enum vm_event_item item; + enum node_stat_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; @@ -2003,19 +1977,17 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); item = PGSCAN_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); - __count_vm_events(PGSCAN_ANON + file, nr_scanned); + mod_lruvec_state(lruvec, item, nr_scanned); + mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); if (nr_taken == 0) return 0; @@ -2023,18 +1995,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, lruvec_memcg(lruvec)); - spin_lock_irq(&lruvec->lru_lock); - move_folios_to_lru(lruvec, &folio_list); + move_folios_to_lru(&folio_list); mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); - __mod_node_page_state(pgdat, 
NR_ISOLATED_ANON + file, -nr_taken); + mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); - __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); + mod_lruvec_state(lruvec, item, nr_reclaimed); + mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed); + lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); @@ -2113,18 +2083,16 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - if (!cgroup_reclaim(sc)) - __count_vm_events(PGREFILL, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + mod_lruvec_state(lruvec, PGREFILL, nr_scanned); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); while (!list_empty(&l_hold)) { struct folio *folio; @@ -2173,16 +2141,14 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move folios back to the lru list. 
*/ - spin_lock_irq(&lruvec->lru_lock); + nr_activate = move_folios_to_lru(&l_active); + nr_deactivate = move_folios_to_lru(&l_inactive); - nr_activate = move_folios_to_lru(lruvec, &l_active); - nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); - - __count_vm_events(PGDEACTIVATE, nr_deactivate); + count_vm_events(PGDEACTIVATE, nr_deactivate); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - + lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); @@ -2319,7 +2285,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) unsigned long file; struct lruvec *target_lruvec; - if (lru_gen_enabled()) + if (lru_gen_enabled() && !lru_gen_switching()) return; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -2658,6 +2624,7 @@ static bool can_age_anon_pages(struct lruvec *lruvec, #ifdef CONFIG_LRU_GEN +DEFINE_STATIC_KEY_FALSE(lru_switch); #ifdef CONFIG_LRU_GEN_ENABLED DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) @@ -2896,8 +2863,9 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) return NULL; clear_bit(key, &mm->lru_gen.bitmap); + mmgrab(mm); - return mmget_not_zero(mm) ? 
mm : NULL; + return mm; } void lru_gen_add_mm(struct mm_struct *mm) @@ -3097,7 +3065,7 @@ done: reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) - mmput_async(*iter); + mmdrop(*iter); *iter = mm; @@ -3452,8 +3420,10 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, if (folio_nid(folio) != pgdat->node_id) return NULL; + rcu_read_lock(); if (folio_memcg(folio) != memcg) - return NULL; + folio = NULL; + rcu_read_unlock(); return folio; } @@ -3506,6 +3476,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int gen = lru_gen_from_seq(max_seq); + unsigned int nr; pmd_t pmdval; pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); @@ -3524,11 +3495,13 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, lazy_mmu_mode_enable(); restart: - for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + for (i = pte_index(start), addr = start; addr != end; i += nr, addr += nr * PAGE_SIZE) { unsigned long pfn; struct folio *folio; - pte_t ptent = ptep_get(pte + i); + pte_t *cur_pte = pte + i; + pte_t ptent = ptep_get(cur_pte); + nr = 1; total++; walk->mm_stats[MM_LEAF_TOTAL]++; @@ -3540,7 +3513,16 @@ restart: if (!folio) continue; - if (!ptep_clear_young_notify(args->vma, addr, pte + i)) + if (folio_test_large(folio)) { + const unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + nr = folio_pte_batch_flags(folio, NULL, cur_pte, &ptent, + max_nr, FPB_MERGE_YOUNG_DIRTY); + total += nr - 1; + walk->mm_stats[MM_LEAF_TOTAL] += nr - 1; + } + + if (!test_and_clear_young_ptes_notify(args->vma, addr, cur_pte, nr)) continue; if (last != folio) { @@ -3553,8 +3535,8 @@ restart: if (pte_dirty(ptent)) dirty = true; - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; + young += nr; + walk->mm_stats[MM_LEAF_YOUNG] += nr; } walk_update_folio(walk, last, gen, dirty); @@ 
-3631,7 +3613,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (!folio) goto next; - if (!pmdp_clear_young_notify(vma, addr, pmd + i)) + if (!pmdp_test_and_clear_young_notify(vma, addr, pmd + i)) goto next; if (last != folio) { @@ -3801,9 +3783,9 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) } if (walk->batched) { - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); reset_batch_size(walk); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } cond_resched(); @@ -3963,7 +3945,7 @@ restart: if (seq < READ_ONCE(lrugen->max_seq)) return false; - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -3978,7 +3960,7 @@ restart: if (inc_min_seq(lruvec, type, swappiness)) continue; - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); cond_resched(); goto restart; } @@ -4013,7 +3995,7 @@ restart: /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); unlock: - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); return success; } @@ -4198,7 +4180,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. 
*/ -bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) { int i; bool dirty; @@ -4211,17 +4193,17 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; struct pglist_data *pgdat = folio_pgdat(folio); - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - DEFINE_MAX_SEQ(lruvec); - int gen = lru_gen_from_seq(max_seq); + struct lruvec *lruvec; + struct lru_gen_mm_state *mm_state; + unsigned long max_seq; + int gen; lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); - if (!ptep_clear_young_notify(vma, addr, pte)) + if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr)) return false; if (spin_is_contended(pvmw->ptl)) @@ -4251,14 +4233,22 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } } + memcg = get_mem_cgroup_from_folio(folio); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + max_seq = READ_ONCE((lruvec)->lrugen.max_seq); + gen = lru_gen_from_seq(max_seq); + mm_state = get_mm_state(lruvec); + lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; - for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + for (i = 0, addr = start; addr != end; + i += nr, pte += nr, addr += nr * PAGE_SIZE) { unsigned long pfn; - pte_t ptent = ptep_get(pte + i); + pte_t ptent = ptep_get(pte); + nr = 1; pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; @@ -4267,7 +4257,14 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (!folio) continue; - if (!ptep_clear_young_notify(vma, addr, pte + i)) + if (folio_test_large(folio)) { + const unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + nr = folio_pte_batch_flags(folio, NULL, pte, &ptent, + 
max_nr, FPB_MERGE_YOUNG_DIRTY); + } + + if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr)) continue; if (last != folio) { @@ -4280,7 +4277,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (pte_dirty(ptent)) dirty = true; - young++; + young += nr; } walk_update_folio(walk, last, gen, dirty); @@ -4291,6 +4288,8 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (mm_state && suitable_to_scan(i, young)) update_bloom_filter(mm_state, max_seq, pvmw->pmd); + mem_cgroup_put(memcg); + return true; } @@ -4426,6 +4425,148 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + return false; + } + + return true; +} + +static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg, + struct lruvec *lruvec) +{ + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + int swappiness = mem_cgroup_swappiness(memcg); + DEFINE_MAX_SEQ(lruvec); + bool success = false; + + /* + * We are not iterating the mm_list here, updating mm_state->seq is just + * to make mm walkers work properly. + */ + if (mm_state) { + spin_lock(&mm_list->lock); + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + if (max_seq > mm_state->seq) { + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + success = true; + } + spin_unlock(&mm_list->lock); + } else { + success = true; + } + + if (success) + inc_max_seq(lruvec, max_seq, swappiness); +} + +/* + * We need to ensure that the folios of child memcg can be reparented to the + * same gen of the parent memcg, so the gens of the parent memcg needed be + * incremented to the MAX_NR_GENS before reparenting. 
+ */ +void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) { + try_to_inc_max_seq_nowalk(memcg, lruvec); + cond_resched(); + } + } +} + +/* + * Compared to traditional LRU, MGLRU faces the following challenges: + * + * 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the + * number of generations of the parent and child memcg may be different, + * so we cannot simply transfer MGLRU folios in the child memcg to the + * parent memcg as we did for traditional LRU folios. + * 2. The generation information is stored in folio->flags, but we cannot + * traverse these folios while holding the lru lock, otherwise it may + * cause softlockup. + * 3. In walk_update_folio(), the gen of folio and corresponding lru size + * may be updated, but the folio is not immediately moved to the + * corresponding lru list. Therefore, there may be folios of different + * generations on an LRU list. + * 4. In lru_gen_del_folio(), the generation to which the folio belongs is + * found based on the generation information in folio->flags, and the + * corresponding LRU size will be updated. Therefore, we need to update + * the lru size correctly during reparenting, otherwise the lru size may + * be updated incorrectly in lru_gen_del_folio(). + * + * Finally, we choose a compromise method, which is to splice the lru list in + * the child memcg to the lru list of the same generation in the parent memcg + * during reparenting. + * + * The same generation has different meanings in the parent and child memcg, + * so this compromise method will cause the LRU inversion problem. But as the + * system runs, this problem will be fixed automatically. 
+ */ +static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec *parent_lruvec, + int zone, int type) +{ + struct lru_gen_folio *child_lrugen, *parent_lrugen; + enum lru_list lru = type * LRU_INACTIVE_FILE; + int i; + + child_lrugen = &child_lruvec->lrugen; + parent_lrugen = &parent_lruvec->lrugen; + + for (i = 0; i < get_nr_gens(child_lruvec, type); i++) { + int gen = lru_gen_from_seq(child_lrugen->max_seq - i); + long nr_pages = child_lrugen->nr_pages[gen][type][zone]; + int child_lru_active = lru_gen_is_active(child_lruvec, gen) ? LRU_ACTIVE : 0; + int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ? LRU_ACTIVE : 0; + + /* Assuming that child pages are colder than parent pages */ + list_splice_tail_init(&child_lrugen->folios[gen][type][zone], + &parent_lrugen->folios[gen][type][zone]); + + WRITE_ONCE(child_lrugen->nr_pages[gen][type][zone], 0); + WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone], + parent_lrugen->nr_pages[gen][type][zone] + nr_pages); + + if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) { + __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages); + __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages); + } + } +} + +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + struct lruvec *child_lruvec, *parent_lruvec; + int type, zid; + struct zone *zone; + enum lru_list lru; + + child_lruvec = get_lruvec(memcg, nid); + parent_lruvec = get_lruvec(parent, nid); + + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) + for (type = 0; type < ANON_AND_FILE; type++) + __lru_gen_reparent_memcg(child_lruvec, parent_lruvec, zid, type); + + for_each_lru(lru) { + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { + unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); + + mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); + } + } +} + #endif 
/* CONFIG_MEMCG */ /****************************************************************************** @@ -4543,7 +4684,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, { int i; int gen; - enum vm_event_item item; + enum node_stat_item item; int sorted = 0; int scanned = 0; int isolated = 0; @@ -4551,7 +4692,6 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, int scan_batch = min(nr_to_scan, MAX_LRU_BATCH); int remaining = scan_batch; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); VM_WARN_ON_ONCE(!list_empty(list)); @@ -4602,13 +4742,9 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, } item = PGSCAN_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) { - __count_vm_events(item, isolated); - __count_vm_events(PGREFILL, sorted); - } - count_memcg_events(memcg, item, isolated); - count_memcg_events(memcg, PGREFILL, sorted); - __count_vm_events(PGSCAN_ANON + type, isolated); + mod_lruvec_state(lruvec, item, isolated); + mod_lruvec_state(lruvec, PGREFILL, sorted); + mod_lruvec_state(lruvec, PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, scan_batch, scanned, skipped, isolated, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); @@ -4624,7 +4760,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, static int get_tier_idx(struct lruvec *lruvec, int type) { int tier; - struct ctrl_pos sp, pv; + struct ctrl_pos sp, pv = {}; /* * To leave a margin for fluctuations, use a larger gain factor (2:3). 
@@ -4643,7 +4779,7 @@ static int get_tier_idx(struct lruvec *lruvec, int type) static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - struct ctrl_pos sp, pv; + struct ctrl_pos sp, pv = {}; if (swappiness <= MIN_SWAPPINESS + 1) return LRU_GEN_FILE; @@ -4693,7 +4829,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, LIST_HEAD(clean); struct folio *folio; struct folio *next; - enum vm_event_item item; + enum node_stat_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; @@ -4701,7 +4837,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); @@ -4710,7 +4846,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); if (list_empty(&list)) return scanned; @@ -4743,26 +4879,22 @@ retry: set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } - spin_lock_irq(&lruvec->lru_lock); - - move_folios_to_lru(lruvec, &list); + move_folios_to_lru(&list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) { walk->lruvec = lruvec; + lruvec_lock_irq(lruvec); reset_batch_size(walk); + lruvec_unlock_irq(lruvec); } mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, reclaimed); - count_memcg_events(memcg, item, reclaimed); - __count_vm_events(PGSTEAL_ANON + type, reclaimed); - - spin_unlock_irq(&lruvec->lru_lock); + mod_lruvec_state(lruvec, item, reclaimed); + mod_lruvec_state(lruvec, PGSTEAL_ANON + type, reclaimed); 
list_splice_init(&clean, &list); @@ -4839,10 +4971,6 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) int i; enum zone_watermarks mark; - /* don't abort memcg reclaim to ensure fairness */ - if (!root_reclaim(sc)) - return false; - if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) return true; @@ -4896,9 +5024,24 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * If too many file cache in the coldest generation can't be evicted * due to being dirty, wake up the flusher. */ - if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) + if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. + */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + /* whether this lruvec should be rotated */ return nr_to_scan < 0; } @@ -5178,6 +5321,8 @@ static void lru_gen_change_state(bool enabled) if (enabled == lru_gen_enabled()) goto unlock; + static_branch_enable_cpuslocked(&lru_switch); + if (enabled) static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); else @@ -5190,7 +5335,7 @@ static void lru_gen_change_state(bool enabled) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); VM_WARN_ON_ONCE(!state_is_valid(lruvec)); @@ -5198,16 +5343,19 @@ static void lru_gen_change_state(bool enabled) lruvec->lrugen.enabled = enabled; while (!(enabled ? 
fill_evictable(lruvec) : drain_evictable(lruvec))) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); cond_resched(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); } - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + static_branch_disable_cpuslocked(&lru_switch); + unlock: mutex_unlock(&state_mutex); put_online_mems(); @@ -5780,9 +5928,12 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled() && !root_reclaim(sc)) { + if ((lru_gen_enabled() || lru_gen_switching()) && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); - return; + + if (!lru_gen_switching()) + return; + } get_scan_count(lruvec, sc, nr); @@ -6042,10 +6193,13 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; - if (lru_gen_enabled() && root_reclaim(sc)) { + if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) { memset(&sc->nr, 0, sizeof(sc->nr)); lru_gen_shrink_node(pgdat, sc); - return; + + if (!lru_gen_switching()) + return; + } target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -6315,7 +6469,7 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; - if (lru_gen_enabled()) + if (lru_gen_enabled() && !lru_gen_switching()) return; target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); @@ -6596,11 +6750,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, return 1; set_task_reclaim_state(current, &sc.reclaim_state); - trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); + trace_mm_vmscan_direct_reclaim_begin(sc.gfp_mask, order, 0); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + 
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed, 0); set_task_reclaim_state(current, NULL); return nr_reclaimed; @@ -6629,8 +6783,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.gfp_mask); + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.gfp_mask, + sc.order, + memcg); /* * NOTE: Although we can get the priority field, using it @@ -6641,7 +6796,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, */ shrink_lruvec(lruvec, &sc); - trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed, memcg); *nr_scanned = sc.nr_scanned; @@ -6677,13 +6832,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); set_task_reclaim_state(current, &sc.reclaim_state); - trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); + trace_mm_vmscan_memcg_reclaim_begin(sc.gfp_mask, 0, memcg); noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); memalloc_noreclaim_restore(noreclaim_flag); - trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed, memcg); set_task_reclaim_state(current, NULL); return nr_reclaimed; @@ -6704,9 +6859,12 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) struct mem_cgroup *memcg; struct lruvec *lruvec; - if (lru_gen_enabled()) { + if (lru_gen_enabled() || lru_gen_switching()) { lru_gen_age_node(pgdat, sc); - return; + + if (!lru_gen_switching()) + return; + } lruvec = mem_cgroup_lruvec(NULL, pgdat); @@ -7657,7 +7815,7 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, delayacct_freepages_end(); psi_memstall_leave(&pflags); - trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); + 
trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed, 0); return sc->nr_reclaimed; } @@ -7879,7 +8037,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 86b14b0f77b5..f534972f517d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -547,7 +547,7 @@ EXPORT_SYMBOL(__dec_node_page_state); #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * If we have cmpxchg_local support then we do not need to incur the overhead - * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. + * that comes with local_irq_save/restore if we use this_cpu_try_cmpxchg(). * * mod_state() modifies the zone counter state through atomic per cpu * operations. @@ -1255,6 +1255,7 @@ const char * const vmstat_text[] = { [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable", [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired", [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released", + [I(NR_VMALLOC)] = "nr_vmalloc", [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack", #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack", @@ -1276,11 +1277,26 @@ const char * const vmstat_text[] = { [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged", [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive", + [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", + [I(PGSTEAL_DIRECT)] = "pgsteal_direct", + [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", + [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive", + [I(PGSTEAL_ANON)] = "pgsteal_anon", + [I(PGSTEAL_FILE)] = "pgsteal_file", + [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", + [I(PGSCAN_DIRECT)] = "pgscan_direct", + [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", + [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", + [I(PGSCAN_ANON)] = 
"pgscan_anon", + [I(PGSCAN_FILE)] = "pgscan_file", + [I(PGREFILL)] = "pgrefill", #ifdef CONFIG_HUGETLB_PAGE [I(NR_HUGETLB)] = "nr_hugetlb", #endif [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages", + [I(NR_GPU_ACTIVE)] = "nr_gpu_active", + [I(NR_GPU_RECLAIM)] = "nr_gpu_reclaim", #undef I /* system-wide enum vm_stat_item counters */ @@ -1318,21 +1334,8 @@ const char * const vmstat_text[] = { [I(PGMAJFAULT)] = "pgmajfault", [I(PGLAZYFREED)] = "pglazyfreed", - [I(PGREFILL)] = "pgrefill", [I(PGREUSE)] = "pgreuse", - [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", - [I(PGSTEAL_DIRECT)] = "pgsteal_direct", - [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", - [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive", - [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", - [I(PGSCAN_DIRECT)] = "pgscan_direct", - [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", - [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle", - [I(PGSCAN_ANON)] = "pgscan_anon", - [I(PGSCAN_FILE)] = "pgscan_file", - [I(PGSTEAL_ANON)] = "pgsteal_anon", - [I(PGSTEAL_FILE)] = "pgsteal_file", #ifdef CONFIG_NUMA [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success", @@ -2138,7 +2141,7 @@ static void vmstat_shepherd(struct work_struct *w) if (cpu_is_isolated(cpu)) continue; - if (!delayed_work_pending(dw) && need_update(cpu)) + if (!work_busy(&dw->work) && need_update(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } diff --git a/mm/workingset.c b/mm/workingset.c index 13422d304715..07e6836d0502 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -16,6 +16,7 @@ #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> +#include "swap_table.h" #include "internal.h" /* @@ -184,7 +185,9 @@ #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) +#define EVICTION_SHIFT_ANON (EVICTION_SHIFT + SWAP_COUNT_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) +#define 
EVICTION_MASK_ANON (~0UL >> EVICTION_SHIFT_ANON) /* * Eviction timestamps need to be able to cover the full range of @@ -194,12 +197,12 @@ * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ -static unsigned int bucket_order __read_mostly; +static unsigned int bucket_order[ANON_AND_FILE] __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, - bool workingset) + bool workingset, bool file) { - eviction &= EVICTION_MASK; + eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; @@ -241,11 +244,15 @@ static void *lru_gen_eviction(struct folio *folio) int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; struct pglist_data *pgdat = folio_pgdat(folio); + unsigned short memcg_id; - BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > + BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON)); + rcu_read_lock(); + memcg = folio_memcg(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); @@ -253,8 +260,10 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); + memcg_id = mem_cgroup_private_id(memcg); + rcu_read_unlock(); - return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset); + return pack_shadow(memcg_id, pgdat, token, workingset, type); } /* @@ -262,7 +271,7 @@ static void *lru_gen_eviction(struct folio *folio) * Fills in @lruvec, @token, @workingset with the values unpacked 
from shadow. */ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, - unsigned long *token, bool *workingset) + unsigned long *token, bool *workingset, bool file) { int memcg_id; unsigned long max_seq; @@ -275,7 +284,7 @@ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); - max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; + max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH; return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } @@ -293,7 +302,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) rcu_read_lock(); - recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); + recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type); if (lruvec != folio_lruvec(folio)) goto unlock; @@ -331,7 +340,7 @@ static void *lru_gen_eviction(struct folio *folio) } static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, - unsigned long *token, bool *workingset) + unsigned long *token, bool *workingset, bool file) { return false; } @@ -381,6 +390,7 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); + int file = folio_is_file_lru(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; @@ -397,10 +407,10 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); - eviction >>= bucket_order; + eviction >>= bucket_order[file]; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, - folio_test_workingset(folio)); + folio_test_workingset(folio), file); } /** @@ -431,14 
+441,15 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool recent; rcu_read_lock(); - recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); + recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, + workingset, file); rcu_read_unlock(); return recent; } rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); - eviction <<= bucket_order; + eviction <<= bucket_order[file]; /* * Look up the memcg associated with the stored ID. It might @@ -495,7 +506,8 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ - refault_distance = (refault - eviction) & EVICTION_MASK; + refault_distance = ((refault - eviction) & + (file ? EVICTION_MASK : EVICTION_MASK_ANON)); /* * Compare the distance to the existing workingset size. We @@ -534,7 +546,6 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); - struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; @@ -557,14 +568,12 @@ void workingset_refault(struct folio *folio, void *shadow) * locked to guarantee folio_memcg() stability throughout. 
*/ nr = folio_nr_pages(folio); - memcg = folio_memcg(folio); - pgdat = folio_pgdat(folio); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - + memcg = get_mem_cgroup_from_folio(folio); + lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio)); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset, true)) - return; + goto out; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); @@ -580,6 +589,8 @@ void workingset_refault(struct folio *folio, void *shadow) lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } +out: + mem_cgroup_put(memcg); } /** @@ -592,8 +603,11 @@ void workingset_activation(struct folio *folio) * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. */ - if (mem_cgroup_disabled() || folio_memcg_charged(folio)) + if (mem_cgroup_disabled() || folio_memcg_charged(folio)) { + rcu_read_lock(); workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); + rcu_read_unlock(); + } } /* @@ -677,9 +691,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) - pages += lruvec_page_state_local(lruvec, - NR_LRU_BASE + i); + pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1); + pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( @@ -780,8 +795,8 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + unsigned int timestamp_bits, timestamp_bits_anon; struct shrinker *workingset_shadow_shrinker; - unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; @@ -794,11 +809,15 @@ static int __init workingset_init(void) * double the initial memory by using totalram_pages as-is. 
*/ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON; max_order = fls_long(totalram_pages() - 1); - if (max_order > timestamp_bits) - bucket_order = max_order - timestamp_bits; - pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", - timestamp_bits, max_order, bucket_order); + if (max_order > (BITS_PER_LONG - EVICTION_SHIFT)) + bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits; + if (max_order > timestamp_bits_anon) + bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon; + pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n", + timestamp_bits, timestamp_bits_anon, max_order, + bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2c1430bf8d57..63128ddb7959 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1727,7 +1727,19 @@ static int zs_page_migrate(struct page *newpage, struct page *page, if (!zspage_write_trylock(zspage)) { spin_unlock(&class->lock); write_unlock(&pool->lock); - return -EINVAL; + /* + * Return -EBUSY but not -EAGAIN: the zspage's reader-lock + * owner may hold the lock for an unbounded duration due to a + * slow decompression or reader-lock owner preemption. + * Since migration retries are bounded by + * NR_MAX_MIGRATE_PAGES_RETRY and performed with virtually no + * delay between attempts, there is no guarantee the lock will + * be released in time for a retry to succeed. + * -EAGAIN implies "try again soon", which does not hold here. + * -EBUSY more accurately conveys "resource is occupied, + * migration cannot proceed". + */ + return -EBUSY; } /* We're committed, tell the world that this is a Zsmalloc page. 
*/ @@ -1741,6 +1753,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, */ d_addr = kmap_local_zpdesc(newzpdesc); copy_page(d_addr, s_addr); + kmsan_copy_page_meta(zpdesc_page(newzpdesc), zpdesc_page(zpdesc)); kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; diff --git a/mm/zswap.c b/mm/zswap.c index e6ec3295bdb0..4b5149173b0e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -242,6 +242,34 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp) **********************************/ static void __zswap_pool_empty(struct percpu_ref *ref); +static void acomp_ctx_free(struct crypto_acomp_ctx *acomp_ctx) +{ + if (!acomp_ctx) + return; + + /* + * If there was an error in allocating @acomp_ctx->req, it + * would be set to NULL. + */ + if (acomp_ctx->req) + acomp_request_free(acomp_ctx->req); + + acomp_ctx->req = NULL; + + /* + * We have to handle both cases here: an error pointer return from + * crypto_alloc_acomp_node(); and a) NULL initialization by zswap, or + * b) NULL assignment done in a previous call to acomp_ctx_free(). + */ + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + + acomp_ctx->acomp = NULL; + + kfree(acomp_ctx->buffer); + acomp_ctx->buffer = NULL; +} + static struct zswap_pool *zswap_pool_create(char *compressor) { struct zswap_pool *pool; @@ -263,19 +291,27 @@ static struct zswap_pool *zswap_pool_create(char *compressor) strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + /* Many things rely on the zero-initialization. */ + pool->acomp_ctx = alloc_percpu_gfp(*pool->acomp_ctx, + GFP_KERNEL | __GFP_ZERO); if (!pool->acomp_ctx) { pr_err("percpu alloc failed\n"); goto error; } - for_each_possible_cpu(cpu) - mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex); - + /* + * This is serialized against CPU hotplug operations. Hence, cores + * cannot be offlined until this finishes. 
+ */ ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + + /* + * cpuhp_state_add_instance() will not cleanup on failure since + * we don't register a hotunplug callback. + */ if (ret) - goto error; + goto cpuhp_add_fail; /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool @@ -292,6 +328,10 @@ static struct zswap_pool *zswap_pool_create(char *compressor) ref_fail: cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + +cpuhp_add_fail: + for_each_possible_cpu(cpu) + acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu)); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -322,9 +362,15 @@ static struct zswap_pool *__zswap_pool_create_fallback(void) static void zswap_pool_destroy(struct zswap_pool *pool) { + int cpu; + zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + + for_each_possible_cpu(cpu) + acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu)); + free_percpu(pool->acomp_ctx); zs_destroy_pool(pool->zs_pool); @@ -664,8 +710,10 @@ void zswap_folio_swapin(struct folio *folio) struct lruvec *lruvec; if (folio) { + rcu_read_lock(); lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); + rcu_read_unlock(); } } @@ -736,39 +784,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp = NULL; - struct acomp_req *req = NULL; - u8 *buffer = NULL; - int ret; + int ret = -ENOMEM; - buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); - if (!buffer) { - ret = -ENOMEM; - goto fail; + /* + * To handle cases where the CPU goes through online-offline-online + * transitions, we return if the acomp_ctx has already been initialized. 
+ */ + if (acomp_ctx->acomp) { + WARN_ON_ONCE(IS_ERR(acomp_ctx->acomp)); + return 0; } - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return ret; + + /* + * In case of an error, crypto_alloc_acomp_node() returns an + * error pointer, never NULL. + */ + acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp_ctx->acomp)) { pr_err("could not alloc crypto acomp %s : %pe\n", - pool->tfm_name, acomp); - ret = PTR_ERR(acomp); + pool->tfm_name, acomp_ctx->acomp); + ret = PTR_ERR(acomp_ctx->acomp); goto fail; } - req = acomp_request_alloc(acomp); - if (!req) { + /* acomp_request_alloc() returns NULL in case of an error. */ + acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp); + if (!acomp_ctx->req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); - ret = -ENOMEM; goto fail; } - /* - * Only hold the mutex after completing allocations, otherwise we may - * recurse into zswap through reclaim and attempt to hold the mutex - * again resulting in a deadlock. - */ - mutex_lock(&acomp_ctx->mutex); crypto_init_wait(&acomp_ctx->wait); /* @@ -776,80 +826,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) * crypto_wait_req(); if the backend of acomp is scomp, the callback * won't be called, crypto_wait_req() will return without blocking. 
*/ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); - acomp_ctx->buffer = buffer; - acomp_ctx->acomp = acomp; - acomp_ctx->req = req; - mutex_unlock(&acomp_ctx->mutex); + mutex_init(&acomp_ctx->mutex); return 0; fail: - if (!IS_ERR_OR_NULL(acomp)) - crypto_free_acomp(acomp); - kfree(buffer); + acomp_ctx_free(acomp_ctx); return ret; } -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct acomp_req *req; - struct crypto_acomp *acomp; - u8 *buffer; - - if (IS_ERR_OR_NULL(acomp_ctx)) - return 0; - - mutex_lock(&acomp_ctx->mutex); - req = acomp_ctx->req; - acomp = acomp_ctx->acomp; - buffer = acomp_ctx->buffer; - acomp_ctx->req = NULL; - acomp_ctx->acomp = NULL; - acomp_ctx->buffer = NULL; - mutex_unlock(&acomp_ctx->mutex); - - /* - * Do the actual freeing after releasing the mutex to avoid subtle - * locking dependencies causing deadlocks. - */ - if (!IS_ERR_OR_NULL(req)) - acomp_request_free(req); - if (!IS_ERR_OR_NULL(acomp)) - crypto_free_acomp(acomp); - kfree(buffer); - - return 0; -} - -static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool) -{ - struct crypto_acomp_ctx *acomp_ctx; - - for (;;) { - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - if (likely(acomp_ctx->req)) - return acomp_ctx; - /* - * It is possible that we were migrated to a different CPU after - * getting the per-CPU ctx but before the mutex was acquired. If - * the old CPU got offlined, zswap_cpu_comp_dead() could have - * already freed ctx->req (among other things) and set it to - * NULL. Just try again on the new CPU that we ended up on. 
- */ - mutex_unlock(&acomp_ctx->mutex); - } -} - -static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx) -{ - mutex_unlock(&acomp_ctx->mutex); -} - static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -862,7 +849,9 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, u8 *dst; bool mapped = false; - acomp_ctx = acomp_ctx_get_cpu_lock(pool); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); @@ -893,11 +882,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, * to the active LRU list in the case. */ if (comp_ret || !dlen || dlen >= PAGE_SIZE) { + rcu_read_lock(); if (!mem_cgroup_zswap_writeback_enabled( folio_memcg(page_folio(page)))) { + rcu_read_unlock(); comp_ret = comp_ret ? comp_ret : -EINVAL; goto unlock; } + rcu_read_unlock(); comp_ret = 0; dlen = PAGE_SIZE; dst = kmap_local_page(page); @@ -925,7 +917,7 @@ unlock: else if (alloc_ret) zswap_reject_alloc_fail++; - acomp_ctx_put_unlock(acomp_ctx); + mutex_unlock(&acomp_ctx->mutex); return comp_ret == 0 && alloc_ret == 0; } @@ -937,14 +929,21 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; int ret = 0, dlen; - acomp_ctx = acomp_ctx_get_cpu_lock(pool); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); zs_obj_read_sg_begin(pool->zs_pool, entry->handle, input, entry->length); /* zswap entries of length PAGE_SIZE are not compressed. 
*/ if (entry->length == PAGE_SIZE) { + void *dst; + WARN_ON_ONCE(input->length != PAGE_SIZE); - memcpy_from_sglist(kmap_local_folio(folio, 0), input, 0, PAGE_SIZE); + + dst = kmap_local_folio(folio, 0); + memcpy_from_sglist(dst, input, 0, PAGE_SIZE); dlen = PAGE_SIZE; + kunmap_local(dst); + flush_dcache_folio(folio); } else { sg_init_table(&output, 1); sg_set_folio(&output, folio, PAGE_SIZE, 0); @@ -956,7 +955,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) } zs_obj_read_sg_end(pool->zs_pool, entry->handle); - acomp_ctx_put_unlock(acomp_ctx); + mutex_unlock(&acomp_ctx->mutex); if (!ret && dlen == PAGE_SIZE) return true; @@ -1589,11 +1588,11 @@ int zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - bool swapcache = folio_test_swapcache(folio); struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; VM_WARN_ON_ONCE(!folio_test_locked(folio)); + VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); if (zswap_never_enabled()) return -ENOENT; @@ -1624,22 +1623,15 @@ int zswap_load(struct folio *folio) count_objcg_events(entry->objcg, ZSWPIN, 1); /* - * When reading into the swapcache, invalidate our entry. The - * swapcache can be the authoritative owner of the page and + * We are reading into the swapcache, invalidate zswap entry. + * The swapcache is the authoritative owner of the page and * its mappings, and the pressure that results from having two * in-memory copies outweighs any benefits of caching the * compression work. - * - * (Most swapins go through the swapcache. The notable - * exception is the singleton fault on SWP_SYNCHRONOUS_IO - * files, which reads into a private page and may free it if - * the fault fails. We remain the primary owner of the entry.) 
*/ - if (swapcache) { - folio_mark_dirty(folio); - xa_erase(tree, offset); - zswap_entry_free(entry); - } + folio_mark_dirty(folio); + xa_erase(tree, offset); + zswap_entry_free(entry); folio_unlock(folio); return 0; @@ -1783,7 +1775,7 @@ static int zswap_setup(void) ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, - zswap_cpu_comp_dead); + NULL); if (ret) goto hp_fail; |
