Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                     |  18
-rw-r--r--  mm/backing-dev.c               |   2
-rw-r--r--  mm/cma.c                       |   2
-rw-r--r--  mm/damon/Kconfig               |   2
-rw-r--r--  mm/damon/core.c                |  95
-rw-r--r--  mm/damon/ops-common.c          |  11
-rw-r--r--  mm/damon/ops-common.h          |   2
-rw-r--r--  mm/damon/paddr.c               | 130
-rw-r--r--  mm/damon/sysfs.c               |  41
-rw-r--r--  mm/damon/tests/core-kunit.h    |  38
-rw-r--r--  mm/damon/tests/vaddr-kunit.h   |   2
-rw-r--r--  mm/damon/vaddr.c               | 105
-rw-r--r--  mm/debug.c                     |   4
-rw-r--r--  mm/filemap.c                   |  47
-rw-r--r--  mm/gup.c                       |  12
-rw-r--r--  mm/huge_memory.c               | 224
-rw-r--r--  mm/hugetlb.c                   |   8
-rw-r--r--  mm/internal.h                  |   5
-rw-r--r--  mm/kasan/init.c                |   4
-rw-r--r--  mm/kasan/kasan_test_c.c        |  40
-rw-r--r--  mm/khugepaged.c                |  34
-rw-r--r--  mm/ksm.c                       |  32
-rw-r--r--  mm/memcontrol.c                |   7
-rw-r--r--  mm/memory-failure.c            |  12
-rw-r--r--  mm/memory.c                    | 352
-rw-r--r--  mm/migrate.c                   |  85
-rw-r--r--  mm/migrate_device.c            |   2
-rw-r--r--  mm/mincore.c                   |  71
-rw-r--r--  mm/mmap.c                      |   8
-rw-r--r--  mm/mmap_lock.c                 | 109
-rw-r--r--  mm/mmu_gather.c                |   4
-rw-r--r--  mm/mmzone.c                    |   4
-rw-r--r--  mm/nommu.c                     |  17
-rw-r--r--  mm/oom_kill.c                  |  26
-rw-r--r--  mm/page-writeback.c            |  46
-rw-r--r--  mm/page_alloc.c                |  47
-rw-r--r--  mm/pagewalk.c                  |  20
-rw-r--r--  mm/rmap.c                      | 128
-rw-r--r--  mm/shmem.c                     |  16
-rw-r--r--  mm/slab.h                      |   6
-rw-r--r--  mm/slub.c                      |  77
-rw-r--r--  mm/sparse.c                    |   6
-rw-r--r--  mm/swap.c                      |  10
-rw-r--r--  mm/swap.h                      |  10
-rw-r--r--  mm/swap_state.c                |  38
-rw-r--r--  mm/swapfile.c                  | 114
-rw-r--r--  mm/userfaultfd.c               | 222
-rw-r--r--  mm/util.c                      |   6
-rw-r--r--  mm/vma.h                       |   2
-rw-r--r--  mm/vma_init.c                  |   2
-rw-r--r--  mm/vmalloc.c                   |  31
-rw-r--r--  mm/vmscan.c                    |  31
-rw-r--r--  mm/vmstat.c                    |   2
-rw-r--r--  mm/workingset.c                |   2
-rw-r--r--  mm/zsmalloc.c                  |   4
-rw-r--r--  mm/zswap.c                     |  62
56 files changed, 1544 insertions, 893 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..4108bcd96784 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config PERSISTENT_HUGE_ZERO_FOLIO
+ bool "Allocate a PMD sized folio for zeroing"
+ depends on TRANSPARENT_HUGEPAGE
+ help
+ Enable this option to reduce the runtime refcounting overhead
+ of the huge zero folio and expand the places in the kernel
+ that can use huge zero folios. For instance, block I/O benefits
+ from access to large folios for zeroing memory.
+
+ With this option enabled, the huge zero folio is allocated
+ once and never freed. One full huge page's worth of memory will
+ be used.
+
+ Say Y if your system has lots of memory. Say N if you are
+ memory constrained.
+
config MM_ID
def_bool n
@@ -1381,6 +1397,8 @@ config PT_RECLAIM
Note: now only empty user PTE page table pages will be reclaimed.
+config FIND_NORMAL_PAGE
+ def_bool n
source "mm/damon/Kconfig"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..e4d578e6121c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work)
/*
* Initial write bandwidth: 100 MB/s
*/
-#define INIT_BW (100 << (20 - PAGE_SHIFT))
+#define INIT_BW MB_TO_PAGES(100)
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
gfp_t gfp)
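
The INIT_BW change replaces the open-coded 100 << (20 - PAGE_SHIFT) with MB_TO_PAGES(100). Assuming MB_TO_PAGES(mb) expands to the same shift (an assumption; the macro is defined elsewhere in this series), the two forms are numerically identical. A standalone check with 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT      12                              /* assumption: 4 KiB pages */
#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT))     /* assumed expansion */

int main(void)
{
        /* both print 25600: 100 MB is 25600 pages of 4 KiB */
        printf("%d %d\n", 100 << (20 - PAGE_SHIFT), MB_TO_PAGES(100));
        return 0;
}
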
diff --git a/mm/cma.c b/mm/cma.c
index 2ffa4befb99a..e56ec64d0567 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -864,7 +864,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
if (!count)
return page;
- trace_cma_alloc_start(name, count, align);
+ trace_cma_alloc_start(name, count, cma->available_count, cma->count, align);
for (r = 0; r < cma->nranges; r++) {
page = NULL;
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index b3171f9406c1..8c868f7035fc 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -104,7 +104,7 @@ config DAMON_STAT
config DAMON_STAT_ENABLED_DEFAULT
bool "Enable DAMON_STAT by default"
- depends on DAMON_PADDR
+ depends on DAMON_STAT
default DAMON_STAT
help
Whether to enable DAMON_STAT by default. Users can disable it in
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 08065b363972..be5942435d78 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* @t: the given target.
* @ranges: array of new monitoring target ranges.
* @nr_ranges: length of @ranges.
+ * @min_sz_region: minimum region size.
*
* This function adds new regions to, or modifies existing regions of, a
* monitoring target to fit in specific ranges.
@@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* Return: 0 if success, or negative error code otherwise.
*/
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges)
+ unsigned int nr_ranges, unsigned long min_sz_region)
{
struct damon_region *r, *next;
unsigned int i;
@@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
/* no region intersects with this range */
newr = damon_new_region(
ALIGN_DOWN(range->start,
- DAMON_MIN_REGION),
- ALIGN(range->end, DAMON_MIN_REGION));
+ min_sz_region),
+ ALIGN(range->end, min_sz_region));
if (!newr)
return -ENOMEM;
damon_insert_region(newr, damon_prev_region(r), r, t);
} else {
/* resize intersecting regions to fit in this range */
first->ar.start = ALIGN_DOWN(range->start,
- DAMON_MIN_REGION);
- last->ar.end = ALIGN(range->end, DAMON_MIN_REGION);
+ min_sz_region);
+ last->ar.end = ALIGN(range->end, min_sz_region);
/* fill possible holes in the range */
err = damon_fill_regions_holes(first, last, t);
@@ -544,6 +545,9 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.min_nr_regions = 10;
ctx->attrs.max_nr_regions = 1000;
+ ctx->addr_unit = 1;
+ ctx->min_sz_region = DAMON_MIN_REGION;
+
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
@@ -570,6 +574,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
kfree(ctx);
}
+static bool damon_attrs_equals(const struct damon_attrs *attrs1,
+ const struct damon_attrs *attrs2)
+{
+ const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal;
+ const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal;
+
+ return attrs1->sample_interval == attrs2->sample_interval &&
+ attrs1->aggr_interval == attrs2->aggr_interval &&
+ attrs1->ops_update_interval == attrs2->ops_update_interval &&
+ attrs1->min_nr_regions == attrs2->min_nr_regions &&
+ attrs1->max_nr_regions == attrs2->max_nr_regions &&
+ ig1->access_bp == ig2->access_bp &&
+ ig1->aggrs == ig2->aggrs &&
+ ig1->min_sample_us == ig2->min_sample_us &&
+ ig1->max_sample_us == ig2->max_sample_us;
+}
+
static unsigned int damon_age_for_new_attrs(unsigned int age,
struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
{
@@ -1108,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx)
*
* If @src has no region, @dst keeps current regions.
*/
-static int damon_commit_target_regions(
- struct damon_target *dst, struct damon_target *src)
+static int damon_commit_target_regions(struct damon_target *dst,
+ struct damon_target *src, unsigned long src_min_sz_region)
{
struct damon_region *src_region;
struct damon_addr_range *ranges;
@@ -1126,18 +1147,19 @@ static int damon_commit_target_regions(
i = 0;
damon_for_each_region(src_region, src)
ranges[i++] = src_region->ar;
- err = damon_set_regions(dst, ranges, i);
+ err = damon_set_regions(dst, ranges, i, src_min_sz_region);
kfree(ranges);
return err;
}
static int damon_commit_target(
struct damon_target *dst, bool dst_has_pid,
- struct damon_target *src, bool src_has_pid)
+ struct damon_target *src, bool src_has_pid,
+ unsigned long src_min_sz_region)
{
int err;
- err = damon_commit_target_regions(dst, src);
+ err = damon_commit_target_regions(dst, src, src_min_sz_region);
if (err)
return err;
if (dst_has_pid)
@@ -1159,7 +1181,8 @@ static int damon_commit_targets(
if (src_target) {
err = damon_commit_target(
dst_target, damon_target_has_pid(dst),
- src_target, damon_target_has_pid(src));
+ src_target, damon_target_has_pid(src),
+ src->min_sz_region);
if (err)
return err;
} else {
@@ -1182,7 +1205,8 @@ static int damon_commit_targets(
if (!new_target)
return -ENOMEM;
err = damon_commit_target(new_target, false,
- src_target, damon_target_has_pid(src));
+ src_target, damon_target_has_pid(src),
+ src->min_sz_region);
if (err) {
damon_destroy_target(new_target, NULL);
return err;
@@ -1222,10 +1246,14 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
* 2. ops update should be done after pid handling is done (target
* committing requires putting pids).
*/
- err = damon_set_attrs(dst, &src->attrs);
- if (err)
- return err;
+ if (!damon_attrs_equals(&dst->attrs, &src->attrs)) {
+ err = damon_set_attrs(dst, &src->attrs);
+ if (err)
+ return err;
+ }
dst->ops = src->ops;
+ dst->addr_unit = src->addr_unit;
+ dst->min_sz_region = src->min_sz_region;
return 0;
}
@@ -1258,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
if (ctx->attrs.min_nr_regions)
sz /= ctx->attrs.min_nr_regions;
- if (sz < DAMON_MIN_REGION)
- sz = DAMON_MIN_REGION;
+ if (sz < ctx->min_sz_region)
+ sz = ctx->min_sz_region;
return sz;
}
@@ -1603,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* @t: The target of the region.
* @rp: The pointer to the region.
* @s: The scheme to be applied.
+ * @min_sz_region: minimum region size.
*
* If a scheme's quota has been exceeded in a quota charge window, the scheme's
* action would be applied to only a part of the target access pattern fulfilling
@@ -1620,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* Return: true if the region should be entirely skipped, false otherwise.
*/
static bool damos_skip_charged_region(struct damon_target *t,
- struct damon_region **rp, struct damos *s)
+ struct damon_region **rp, struct damos *s, unsigned long min_sz_region)
{
struct damon_region *r = *rp;
struct damos_quota *quota = &s->quota;
@@ -1642,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t,
if (quota->charge_addr_from && r->ar.start <
quota->charge_addr_from) {
sz_to_skip = ALIGN_DOWN(quota->charge_addr_from -
- r->ar.start, DAMON_MIN_REGION);
+ r->ar.start, min_sz_region);
if (!sz_to_skip) {
- if (damon_sz_region(r) <= DAMON_MIN_REGION)
+ if (damon_sz_region(r) <= min_sz_region)
return true;
- sz_to_skip = DAMON_MIN_REGION;
+ sz_to_skip = min_sz_region;
}
damon_split_region_at(t, r, sz_to_skip);
r = damon_next_region(r);
@@ -1671,7 +1700,8 @@ static void damos_update_stat(struct damos *s,
}
static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
- struct damon_region *r, struct damos_filter *filter)
+ struct damon_region *r, struct damos_filter *filter,
+ unsigned long min_sz_region)
{
bool matched = false;
struct damon_target *ti;
@@ -1688,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
matched = target_idx == filter->target_idx;
break;
case DAMOS_FILTER_TYPE_ADDR:
- start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION);
- end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION);
+ start = ALIGN_DOWN(filter->addr_range.start, min_sz_region);
+ end = ALIGN_DOWN(filter->addr_range.end, min_sz_region);
/* inside the range */
if (start <= r->ar.start && r->ar.end <= end) {
@@ -1725,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (damos_filter_match(ctx, t, r, filter)) {
+ if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) {
if (filter->allow)
s->core_filters_allowed = true;
return !filter->allow;
@@ -1860,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (c->ops.apply_scheme) {
if (quota->esz && quota->charged_sz + sz > quota->esz) {
sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
- DAMON_MIN_REGION);
+ c->min_sz_region);
if (!sz)
goto update_stat;
damon_split_region_at(t, r, sz);
@@ -1908,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
if (quota->esz && quota->charged_sz >= quota->esz)
continue;
- if (damos_skip_charged_region(t, &r, s))
+ if (damos_skip_charged_region(t, &r, s, c->min_sz_region))
continue;
if (!damos_valid_target(c, t, r, s))
@@ -2306,7 +2336,8 @@ static void damon_split_region_at(struct damon_target *t,
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_target *t, int nr_subs)
+static void damon_split_regions_of(struct damon_target *t, int nr_subs,
+ unsigned long min_sz_region)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
@@ -2316,13 +2347,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs)
sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
- sz_region > 2 * DAMON_MIN_REGION; i++) {
+ sz_region > 2 * min_sz_region; i++) {
/*
* Randomly select size of left sub-region to be at
* least 10 percent and at most 90% of original region
*/
sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
- sz_region / 10, DAMON_MIN_REGION);
+ sz_region / 10, min_sz_region);
/* Do not allow blank region */
if (sz_sub == 0 || sz_sub >= sz_region)
continue;
@@ -2362,7 +2393,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(t, nr_subregions);
+ damon_split_regions_of(t, nr_subregions, ctx->min_sz_region);
last_nr_regions = nr_regions;
}
@@ -2755,7 +2786,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
addr_range.start = *start;
addr_range.end = *end;
- return damon_set_regions(t, &addr_range, 1);
+ return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION);
}
/*
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 99321ff5cb92..998c5180a603 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -303,7 +303,7 @@ static unsigned int __damon_migrate_folio_list(
* instead of migrated.
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
- __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
};
@@ -412,3 +412,12 @@ unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid)
return nr_migrated;
}
+
+bool damos_ops_has_filter(struct damos *s)
+{
+ struct damos_filter *f;
+
+ damos_for_each_ops_filter(f, s)
+ return true;
+ return false;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 61ad54aaf256..5efa5b5970de 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -21,3 +21,5 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio);
unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid);
+
+bool damos_ops_has_filter(struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 53a55c5114fb..07a8aead439e 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -18,7 +18,26 @@
#include "../internal.h"
#include "ops-common.h"
-static void damon_pa_mkold(unsigned long paddr)
+static phys_addr_t damon_pa_phys_addr(
+ unsigned long addr, unsigned long addr_unit)
+{
+ return (phys_addr_t)addr * addr_unit;
+}
+
+static unsigned long damon_pa_core_addr(
+ phys_addr_t pa, unsigned long addr_unit)
+{
+ /*
+ * Use div_u64() to avoid linking errors related to __udivdi3,
+ * __aeabi_uldivmod, and similar symbols. It may also be faster on
+ * 32-bit builds (see the div_u64() comment for details).
+ */
+ if (sizeof(pa) == 8 && sizeof(addr_unit) == 4)
+ return div_u64(pa, addr_unit);
+ return pa / addr_unit;
+}
+
+static void damon_pa_mkold(phys_addr_t paddr)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
@@ -29,11 +48,12 @@ static void damon_pa_mkold(unsigned long paddr)
folio_put(folio);
}
-static void __damon_pa_prepare_access_check(struct damon_region *r)
+static void __damon_pa_prepare_access_check(struct damon_region *r,
+ unsigned long addr_unit)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
- damon_pa_mkold(r->sampling_addr);
+ damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit));
}
static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
@@ -43,11 +63,11 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- __damon_pa_prepare_access_check(r);
+ __damon_pa_prepare_access_check(r, ctx->addr_unit);
}
}
-static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
+static bool damon_pa_young(phys_addr_t paddr, unsigned long *folio_sz)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
bool accessed;
@@ -62,23 +82,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
}
static void __damon_pa_check_access(struct damon_region *r,
- struct damon_attrs *attrs)
+ struct damon_attrs *attrs, unsigned long addr_unit)
{
- static unsigned long last_addr;
+ static phys_addr_t last_addr;
static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
+ phys_addr_t sampling_addr = damon_pa_phys_addr(
+ r->sampling_addr, addr_unit);
/* If the region is in the last checked page, reuse the result */
if (ALIGN_DOWN(last_addr, last_folio_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
+ ALIGN_DOWN(sampling_addr, last_folio_sz)) {
damon_update_region_access_rate(r, last_accessed, attrs);
return;
}
- last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
+ last_accessed = damon_pa_young(sampling_addr, &last_folio_sz);
damon_update_region_access_rate(r, last_accessed, attrs);
- last_addr = r->sampling_addr;
+ last_addr = sampling_addr;
}
static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
@@ -89,7 +111,8 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t) {
- __damon_pa_check_access(r, &ctx->attrs);
+ __damon_pa_check_access(
+ r, &ctx->attrs, ctx->addr_unit);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
}
@@ -125,10 +148,11 @@ static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s)
return false;
}
-static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_pageout(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
+ phys_addr_t addr, applied;
LIST_HEAD(folio_list);
bool install_young_filter = true;
struct damos_filter *filter;
@@ -149,8 +173,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
damos_add_filter(s, filter);
}
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -160,7 +184,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
folio_clear_referenced(folio);
folio_test_clear_young(folio);
@@ -179,18 +203,19 @@ put_folio:
applied = reclaim_pages(&folio_list);
cond_resched();
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, struct damos *s, bool mark_accessed,
+ struct damon_region *r, unsigned long addr_unit,
+ struct damos *s, bool mark_accessed,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied = 0;
+ phys_addr_t addr, applied = 0;
struct folio *folio;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -200,7 +225,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
if (mark_accessed)
folio_mark_accessed(folio);
@@ -212,32 +237,35 @@ put_folio:
folio_put(folio);
}
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
static unsigned long damon_pa_mark_accessed(struct damon_region *r,
- struct damos *s, unsigned long *sz_filter_passed)
+ unsigned long addr_unit, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, true,
+ return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true,
sz_filter_passed);
}
static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
- struct damos *s, unsigned long *sz_filter_passed)
+ unsigned long addr_unit, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, false,
+ return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false,
sz_filter_passed);
}
-static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_migrate(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
+ phys_addr_t addr, applied;
LIST_HEAD(folio_list);
struct folio *folio;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -247,7 +275,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
if (!folio_isolate_lru(folio))
goto put_folio;
@@ -259,29 +287,21 @@ put_folio:
applied = damon_migrate_pages(&folio_list, s->target_nid);
cond_resched();
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
-static bool damon_pa_scheme_has_filter(struct damos *s)
-{
- struct damos_filter *f;
-
- damos_for_each_ops_filter(f, s)
- return true;
- return false;
-}
-
-static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_stat(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr;
+ phys_addr_t addr;
struct folio *folio;
- if (!damon_pa_scheme_has_filter(s))
+ if (!damos_ops_has_filter(s))
return 0;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -289,7 +309,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
}
if (!damos_pa_filter_out(s, folio))
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
addr += folio_size(folio);
folio_put(folio);
}
@@ -301,18 +321,22 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
{
+ unsigned long aunit = ctx->addr_unit;
+
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r, scheme, sz_filter_passed);
+ return damon_pa_pageout(r, aunit, scheme, sz_filter_passed);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r, scheme, sz_filter_passed);
+ return damon_pa_mark_accessed(r, aunit, scheme,
+ sz_filter_passed);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r, scheme, sz_filter_passed);
+ return damon_pa_deactivate_pages(r, aunit, scheme,
+ sz_filter_passed);
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
- return damon_pa_migrate(r, scheme, sz_filter_passed);
+ return damon_pa_migrate(r, aunit, scheme, sz_filter_passed);
case DAMOS_STAT:
- return damon_pa_stat(r, scheme, sz_filter_passed);
+ return damon_pa_stat(r, aunit, scheme, sz_filter_passed);
default:
/* DAMOS actions that not yet supported by 'paddr'. */
break;
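
The addr_unit plumbing above means DAMON core addresses for the physical address space are expressed in units of addr_unit bytes: damon_pa_phys_addr() scales a core address up to a phys_addr_t, and damon_pa_core_addr() scales back down (via div_u64() so 32-bit builds avoid 64/64 division). A user-space model of the round trip, with illustrative values:

#include <stdio.h>
#include <stdint.h>

/* stand-ins for the paddr.c helpers, with uint64_t in place of phys_addr_t */
static uint64_t pa_phys_addr(unsigned long addr, unsigned long addr_unit)
{
        return (uint64_t)addr * addr_unit;
}

static unsigned long pa_core_addr(uint64_t pa, unsigned long addr_unit)
{
        return pa / addr_unit;          /* the kernel uses div_u64() here */
}

int main(void)
{
        unsigned long addr_unit = 4096;         /* example: one 4 KiB page per unit */
        unsigned long core = 0x200000;          /* region address in core units */
        uint64_t phys = pa_phys_addr(core, addr_unit);

        /* phys is 0x200000000 (8 GiB), representable even where long is 32-bit */
        printf("phys=0x%llx core=0x%lx\n",
               (unsigned long long)phys, pa_core_addr(phys, addr_unit));
        return 0;
}
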
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index c96c2154128f..fe4e73d0ebbb 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -834,6 +834,7 @@ static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = {
struct damon_sysfs_context {
struct kobject kobj;
enum damon_ops_id ops_id;
+ unsigned long addr_unit;
struct damon_sysfs_attrs *attrs;
struct damon_sysfs_targets *targets;
struct damon_sysfs_schemes *schemes;
@@ -849,6 +850,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc(
return NULL;
context->kobj = (struct kobject){};
context->ops_id = ops_id;
+ context->addr_unit = 1;
return context;
}
@@ -997,6 +999,32 @@ static ssize_t operations_store(struct kobject *kobj,
return -EINVAL;
}
+static ssize_t addr_unit_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%lu\n", context->addr_unit);
+}
+
+static ssize_t addr_unit_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ unsigned long input_addr_unit;
+ int err = kstrtoul(buf, 0, &input_addr_unit);
+
+ if (err)
+ return err;
+ if (!input_addr_unit)
+ return -EINVAL;
+
+ context->addr_unit = input_addr_unit;
+ return count;
+}
+
static void damon_sysfs_context_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_context, kobj));
@@ -1008,9 +1036,13 @@ static struct kobj_attribute damon_sysfs_context_avail_operations_attr =
static struct kobj_attribute damon_sysfs_context_operations_attr =
__ATTR_RW_MODE(operations, 0600);
+static struct kobj_attribute damon_sysfs_context_addr_unit_attr =
+ __ATTR_RW_MODE(addr_unit, 0600);
+
static struct attribute *damon_sysfs_context_attrs[] = {
&damon_sysfs_context_avail_operations_attr.attr,
&damon_sysfs_context_operations_attr.attr,
+ &damon_sysfs_context_addr_unit_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_context);
@@ -1301,7 +1333,8 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
}
static int damon_sysfs_set_regions(struct damon_target *t,
- struct damon_sysfs_regions *sysfs_regions)
+ struct damon_sysfs_regions *sysfs_regions,
+ unsigned long min_sz_region)
{
struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr,
sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN);
@@ -1323,7 +1356,7 @@ static int damon_sysfs_set_regions(struct damon_target *t,
if (ranges[i - 1].end > ranges[i].start)
goto out;
}
- err = damon_set_regions(t, ranges, sysfs_regions->nr);
+ err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region);
out:
kfree(ranges);
return err;
@@ -1344,7 +1377,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
/* caller will destroy targets */
return -EINVAL;
}
- return damon_sysfs_set_regions(t, sys_target->regions);
+ return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region);
}
static int damon_sysfs_add_targets(struct damon_ctx *ctx,
@@ -1401,6 +1434,8 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
err = damon_select_ops(ctx, sys_ctx->ops_id);
if (err)
return err;
+ ctx->addr_unit = sys_ctx->addr_unit;
+ ctx->min_sz_region = max(DAMON_MIN_REGION / sys_ctx->addr_unit, 1);
err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
if (err)
return err;
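
With an address unit in place, the sysfs code scales the minimum region size accordingly: ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1), so a region boundary still covers at least DAMON_MIN_REGION bytes of physical memory. A small illustration, assuming DAMON_MIN_REGION is 4096 (one 4 KiB page; an assumption made only for this example):

#include <stdio.h>

#define DAMON_MIN_REGION        4096UL  /* assumed value for illustration */

static unsigned long min_sz_region(unsigned long addr_unit)
{
        unsigned long min = DAMON_MIN_REGION / addr_unit;

        return min ? min : 1;           /* max(DAMON_MIN_REGION / addr_unit, 1) */
}

int main(void)
{
        /* prints 4096 1 1: large address units clamp the minimum to 1 */
        printf("%lu %lu %lu\n", min_sz_region(1), min_sz_region(4096),
               min_sz_region(8192));
        return 0;
}
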
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index dfedfff19940..51369e35298b 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
- damon_split_regions_of(t, 2);
+ damon_split_regions_of(t, 2, DAMON_MIN_REGION);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
- damon_split_regions_of(t, 4);
+ damon_split_regions_of(t, 4, DAMON_MIN_REGION);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
damon_destroy_ctx(c);
@@ -303,7 +303,7 @@ static void damon_test_set_regions(struct kunit *test)
damon_add_region(r1, t);
damon_add_region(r2, t);
- damon_set_regions(t, &range, 1);
+ damon_set_regions(t, &range, 1, DAMON_MIN_REGION);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
damon_for_each_region(r, t) {
@@ -419,6 +419,22 @@ static void damos_test_new_filter(struct kunit *test)
damos_destroy_filter(filter);
}
+static void damos_test_commit_filter(struct kunit *test)
+{
+ struct damos_filter *src_filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_ANON, true, true);
+ struct damos_filter *dst_filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_ACTIVE, false, false);
+
+ damos_commit_filter(dst_filter, src_filter);
+ KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type);
+ KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching);
+ KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow);
+
+ damos_destroy_filter(src_filter);
+ damos_destroy_filter(dst_filter);
+}
+
static void damos_test_filter_out(struct kunit *test)
{
struct damon_target *t;
@@ -434,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test)
damon_add_region(r, t);
/* region in the range */
- KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 2;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region after the range */
r->ar.start = DAMON_MIN_REGION * 6;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region started before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 4;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
@@ -465,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test)
/* region started in the range */
r->ar.start = DAMON_MIN_REGION * 2;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
@@ -594,6 +615,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_attrs),
KUNIT_CASE(damon_test_moving_sum),
KUNIT_CASE(damos_test_new_filter),
+ KUNIT_CASE(damos_test_commit_filter),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
KUNIT_CASE(damon_test_set_filters_default_reject),
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index d2b37ccf2cc0..fce38dd53cf8 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
damon_add_region(r, t);
}
- damon_set_regions(t, three_regions, 3);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
for (i = 0; i < nr_expected / 2; i++) {
r = __nth_region_of(t, i);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 87e825349bdf..8c048f9b129e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
if (damon_va_three_regions(t, three_regions))
continue;
- damon_set_regions(t, three_regions, 3);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
}
}
@@ -890,6 +890,107 @@ free_lists:
return applied * PAGE_SIZE;
}
+struct damos_va_stat_private {
+ struct damos *scheme;
+ unsigned long *sz_filter_passed;
+};
+
+static inline bool damos_va_invalid_folio(struct folio *folio,
+ struct damos *s)
+{
+ return !folio || folio == s->last_applied;
+}
+
+static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_stat_private *priv = walk->private;
+ struct damos *s = priv->scheme;
+ unsigned long *sz_filter_passed = priv->sz_filter_passed;
+ struct vm_area_struct *vma = walk->vma;
+ struct folio *folio;
+ spinlock_t *ptl;
+ pte_t *start_pte, *pte, ptent;
+ int nr;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t pmde;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+ pmde = pmdp_get(pmd);
+ if (!pmd_present(pmde))
+ goto huge_unlock;
+
+ folio = vm_normal_folio_pmd(vma, addr, pmde);
+
+ if (damos_va_invalid_folio(folio, s))
+ goto huge_unlock;
+
+ if (!damos_va_filter_out(s, folio, vma, addr, NULL, pmd))
+ *sz_filter_passed += folio_size(folio);
+ s->last_applied = folio;
+
+huge_unlock:
+ spin_unlock(ptl);
+ return 0;
+ }
+#endif
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
+
+ for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
+ ptent = ptep_get(pte);
+
+ if (pte_none(ptent) || !pte_present(ptent))
+ continue;
+
+ folio = vm_normal_folio(vma, addr, ptent);
+
+ if (damos_va_invalid_folio(folio, s))
+ continue;
+
+ if (!damos_va_filter_out(s, folio, vma, addr, pte, NULL))
+ *sz_filter_passed += folio_size(folio);
+ nr = folio_nr_pages(folio);
+ s->last_applied = folio;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+ return 0;
+}
+
+static unsigned long damos_va_stat(struct damon_target *target,
+ struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ struct damos_va_stat_private priv;
+ struct mm_struct *mm;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = damos_va_stat_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+ };
+
+ priv.scheme = s;
+ priv.sz_filter_passed = sz_filter_passed;
+
+ if (!damos_ops_has_filter(s))
+ return 0;
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ return 0;
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return 0;
+}
+
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
@@ -916,7 +1017,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_MIGRATE_COLD:
return damos_va_migrate(t, r, scheme, sz_filter_passed);
case DAMOS_STAT:
- return 0;
+ return damos_va_stat(t, r, scheme, sz_filter_passed);
default:
/*
* DAMOS actions that are not yet supported by 'vaddr'.
diff --git a/mm/debug.c b/mm/debug.c
index b4388f4dcd4d..64ddb0c4b4be 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm)
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
- "binfmt %px flags %lx\n"
+ "binfmt %px flags %*pb\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
@@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm)
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
- mm->binfmt, mm->flags,
+ mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm),
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
diff --git a/mm/filemap.c b/mm/filemap.c
index 751838ef05e5..cd9387b0a5b5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping,
__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
+ if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
+ mod_node_page_state(folio_pgdat(folio),
+ NR_KERNEL_FILE_PAGES, -nr);
/*
* At this point folio must be either written or cleaned by
@@ -960,8 +963,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
{
void *shadow = NULL;
int ret;
+ struct mem_cgroup *tmp;
+ bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);
+ if (kernel_file)
+ tmp = set_active_memcg(root_mem_cgroup);
ret = mem_cgroup_charge(folio, NULL, gfp);
+ if (kernel_file)
+ set_active_memcg(tmp);
if (ret)
return ret;
@@ -983,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
if (!(gfp & __GFP_WRITE) && shadow)
workingset_refault(folio, shadow);
folio_add_lru(folio);
+ if (kernel_file)
+ mod_node_page_state(folio_pgdat(folio),
+ NR_KERNEL_FILE_PAGES,
+ folio_nr_pages(folio));
}
return ret;
}
@@ -1140,10 +1153,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
- if (test_bit(key->bit_nr, &key->folio->flags))
+ if (test_bit(key->bit_nr, &key->folio->flags.f))
return -1;
if (flags & WQ_FLAG_CUSTOM) {
- if (test_and_set_bit(key->bit_nr, &key->folio->flags))
+ if (test_and_set_bit(key->bit_nr, &key->folio->flags.f))
return -1;
flags |= WQ_FLAG_DONE;
}
@@ -1226,9 +1239,9 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
struct wait_queue_entry *wait)
{
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(bit_nr, &folio->flags))
+ if (test_and_set_bit(bit_nr, &folio->flags.f))
return false;
- } else if (test_bit(bit_nr, &folio->flags))
+ } else if (test_bit(bit_nr, &folio->flags.f))
return false;
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1961,7 +1974,7 @@ no_page:
gfp &= ~__GFP_FS;
if (fgp_flags & FGP_NOWAIT) {
gfp &= ~GFP_KERNEL;
- gfp |= GFP_NOWAIT | __GFP_NOWARN;
+ gfp |= GFP_NOWAIT;
}
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
@@ -2447,6 +2460,9 @@ static bool filemap_range_uptodate(struct address_space *mapping,
pos -= folio_pos(folio);
}
+ if (pos == 0 && count >= folio_size(folio))
+ return false;
+
return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}
@@ -2619,9 +2635,10 @@ retry:
goto err;
}
if (!folio_test_uptodate(folio)) {
- if ((iocb->ki_flags & IOCB_WAITQ) &&
- folio_batch_count(fbatch) > 1)
- iocb->ki_flags |= IOCB_NOWAIT;
+ if (folio_batch_count(fbatch) > 1) {
+ err = -EAGAIN;
+ goto err;
+ }
err = filemap_update_page(iocb, mapping, count, folio,
need_uptodate);
if (err)
@@ -3323,9 +3340,17 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
- mmap_miss = READ_ONCE(ra->mmap_miss);
- if (mmap_miss)
- WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ /*
+ * If the folio is locked, we're likely racing against another fault.
+ * Don't touch the mmap_miss counter to avoid decreasing it multiple
+ * times for a single folio and break the balance with mmap_miss
+ * increase in do_sync_mmap_readahead().
+ */
+ if (likely(!folio_test_locked(folio))) {
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ }
if (folio_test_readahead(folio)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
diff --git a/mm/gup.c b/mm/gup.c
index 0bc4d140fc07..ed02d65f9c72 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -148,7 +148,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
- if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page)))
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
return -EREMOTEIO;
if (flags & FOLL_GET)
@@ -475,10 +475,10 @@ EXPORT_SYMBOL_GPL(unpin_folios);
* lifecycle. Avoid setting the bit unless necessary, or it might cause write
* cache bouncing on large SMP machines for concurrent pinned gups.
*/
-static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
+static inline void mm_set_has_pinned_flag(struct mm_struct *mm)
{
- if (!test_bit(MMF_HAS_PINNED, mm_flags))
- set_bit(MMF_HAS_PINNED, mm_flags);
+ if (!mm_flags_test(MMF_HAS_PINNED, mm))
+ mm_flags_set(MMF_HAS_PINNED, mm);
}
#ifdef CONFIG_MMU
@@ -1693,7 +1693,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
mmap_assert_locked(mm);
if (flags & FOLL_PIN)
- mm_set_has_pinned_flag(&mm->flags);
+ mm_set_has_pinned_flag(mm);
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
@@ -3218,7 +3218,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
return -EINVAL;
if (gup_flags & FOLL_PIN)
- mm_set_has_pinned_flag(&current->mm->flags);
+ mm_set_has_pinned_flag(current->mm);
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(&current->mm->mmap_lock);
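
These gup.c hunks are part of the conversion from open-coded bit operations on &mm->flags to the mm_flags_test()/mm_flags_set() accessors, also visible in the mm/debug.c, mm/huge_memory.c and mm/khugepaged.c hunks of this diff. A hedged sketch of the accessor shape the callers imply; the real definitions live in the mm headers and may differ in detail:

/* Sketch only: assumes the accessors forward to the usual bitops on the
 * mm's flag bitmap (see __mm_flags_get_bitmap() in the mm/debug.c hunk). */
static inline bool mm_flags_test(int flag, struct mm_struct *mm)
{
        return test_bit(flag, __mm_flags_get_bitmap(mm));
}

static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
{
        return test_and_set_bit(flag, __mm_flags_get_bitmap(mm));
}

static inline void mm_flags_set(int flag, struct mm_struct *mm)
{
        set_bit(flag, __mm_flags_get_bitmap(mm));
}
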
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f09..26cedfcd7418 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
- bool smaps = tva_flags & TVA_SMAPS;
- bool in_pf = tva_flags & TVA_IN_PF;
- bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+ const bool smaps = type == TVA_SMAPS;
+ const bool in_pf = type == TVA_PAGEFAULT;
+ const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
unsigned long supported_orders;
/* Check the intersection of requested and supported orders. */
@@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
@@ -167,14 +167,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!in_pf && shmem_file(vma->vm_file))
return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
- !enforce_sysfs);
+ forced_collapse);
if (!vma_is_anonymous(vma)) {
/*
- * Enforce sysfs THP requirements as necessary. Anonymous vmas
+ * Enforce THP collapse requirements as necessary. Anonymous vmas
* were already handled in thp_vma_allowable_orders().
*/
- if (enforce_sysfs &&
+ if (!forced_collapse &&
(!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
!hugepage_global_always())))
return 0;
@@ -207,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
-static bool get_huge_zero_page(void)
+static bool get_huge_zero_folio(void)
{
struct folio *zero_folio;
retry:
@@ -237,7 +237,7 @@ retry:
return true;
}
-static void put_huge_zero_page(void)
+static void put_huge_zero_folio(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -248,33 +248,39 @@ static void put_huge_zero_page(void)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return huge_zero_folio;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
return READ_ONCE(huge_zero_folio);
- if (!get_huge_zero_page())
+ if (!get_huge_zero_folio())
return NULL;
- if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
return READ_ONCE(huge_zero_folio);
}
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
}
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
/* we can free zero page only if last reference remains */
return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
@@ -287,7 +293,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker *huge_zero_page_shrinker;
+static struct shrinker *huge_zero_folio_shrinker;
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -849,33 +855,47 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
static int __init thp_shrinker_init(void)
{
- huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
- return -ENOMEM;
-
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE |
SHRINKER_NONSLAB,
"thp-deferred_split");
- if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_page_shrinker);
+ if (!deferred_split_shrinker)
return -ENOMEM;
- }
-
- huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
- huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
- shrinker_register(huge_zero_page_shrinker);
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
shrinker_register(deferred_split_shrinker);
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
+ /*
+ * Bump the reference of the huge_zero_folio and do not
+ * initialize the shrinker.
+ *
+ * huge_zero_folio will always be NULL on failure. We assume
+ * that get_huge_zero_folio() will most likely not fail as
+ * thp_shrinker_init() is invoked early on during boot.
+ */
+ if (!get_huge_zero_folio())
+ pr_warn("Allocating persistent huge zero folio failed\n");
+ return 0;
+ }
+
+ huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_folio_shrinker) {
+ shrinker_free(deferred_split_shrinker);
+ return -ENOMEM;
+ }
+
+ huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
+ huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
+ shrinker_register(huge_zero_folio_shrinker);
+
return 0;
}
static void __init thp_shrinker_exit(void)
{
- shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(huge_zero_folio_shrinker);
shrinker_free(deferred_split_shrinker);
}
@@ -911,7 +931,7 @@ static int __init hugepage_init(void)
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
+ if (totalram_pages() < MB_TO_PAGES(512)) {
transparent_hugepage_flags = 0;
return 0;
}
@@ -1125,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
off_sub = (off - ret) & (size - 1);
- if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
+ if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub)
return ret + size;
ret += off_sub;
@@ -1309,6 +1329,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
{
pmd_t entry;
entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
+ entry = pmd_mkspecial(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1379,15 +1400,25 @@ struct folio_or_pfn {
bool is_folio;
};
-static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
- bool write, pgtable_t pgtable)
+ bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
pmd_t entry;
- lockdep_assert_held(pmd_lockptr(mm, pmd));
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+ ptl = pmd_lock(mm, pmd);
if (!pmd_none(*pmd)) {
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
fop.pfn;
@@ -1395,23 +1426,26 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
if (write) {
if (pmd_pfn(*pmd) != pfn) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- return -EEXIST;
+ goto out_unlock;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
-
- return -EEXIST;
+ goto out_unlock;
}
if (fop.is_folio) {
entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
- folio_get(fop.folio);
- folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
- add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ if (is_huge_zero_folio(fop.folio)) {
+ entry = pmd_mkspecial(entry);
+ } else {
+ folio_get(fop.folio);
+ folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ }
} else {
entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
entry = pmd_mkspecial(entry);
@@ -1424,11 +1458,17 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
+ pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
- return 0;
+
+out_unlock:
+ spin_unlock(ptl);
+ if (pgtable)
+ pte_free(mm, pgtable);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1450,9 +1490,6 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
struct folio_or_pfn fop = {
.pfn = pfn,
};
- pgtable_t pgtable = NULL;
- spinlock_t *ptl;
- int error;
/*
* If we had pmd_special, we could avoid all these restrictions,
@@ -1464,25 +1501,9 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
- pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(vma->vm_mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -1491,35 +1512,15 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PMD_MASK;
- struct mm_struct *mm = vma->vm_mm;
struct folio_or_pfn fop = {
.folio = folio,
.is_folio = true,
};
- spinlock_t *ptl;
- pgtable_t pgtable = NULL;
- int error;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
return VM_FAULT_SIGBUS;
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(mm, vmf->pmd);
- error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot,
- write, pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
@@ -1531,25 +1532,30 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
return pud;
}
-static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
pud_t entry;
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
fop.pfn;
if (write) {
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
- return;
+ goto out_unlock;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- return;
+ goto out_unlock;
}
if (fop.is_folio) {
@@ -1568,6 +1574,9 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
+out_unlock:
+ spin_unlock(ptl);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1589,7 +1598,6 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
struct folio_or_pfn fop = {
.pfn = pfn,
};
- spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1601,16 +1609,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -1627,25 +1628,15 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PUD_MASK;
- pud_t *pud = vmf->pud;
- struct mm_struct *mm = vma->vm_mm;
struct folio_or_pfn fop = {
.folio = folio,
.is_folio = true,
};
- spinlock_t *ptl;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
return VM_FAULT_SIGBUS;
- ptl = pud_lock(mm, pud);
- insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -1675,7 +1666,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
pmd = pmdp_get_lockless(src_pmd);
- if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
+ if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
+ !is_huge_zero_pmd(pmd))) {
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -3310,8 +3302,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* unreferenced sub-pages of an anonymous THP: we can simply drop
* PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
- new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- new_folio->flags |= (folio->flags &
+ new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags.f |= (folio->flags.f &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_swapcache) |
@@ -4327,8 +4319,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
goto out;
}
- pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
- pid, vaddr_start, vaddr_end);
+ pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
mmap_read_lock(mm);
/*
@@ -4438,8 +4430,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (IS_ERR(candidate))
goto out;
- pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
- file_path, off_start, off_end);
+ pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ file_path, off_start, off_end, new_order, in_folio_offset);
mapping = candidate->f_mapping;
min_order = mapping_min_folio_order(mapping);
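
A side effect of mapping the huge zero folio with pmd_mkspecial() (see set_huge_zero_folio() and insert_pmd() above) is that copy_huge_pmd() must not treat it as an ordinary special PFN mapping. A condensed restatement of the adjusted check, for readability:

/* A present, special PMD only takes the raw PFN-mapping copy path when it
 * is not the huge zero PMD; the huge zero PMD keeps its existing handling
 * later in copy_huge_pmd(). */
pmd = pmdp_get_lockless(src_pmd);
if (unlikely(pmd_present(pmd) && pmd_special(pmd) && !is_huge_zero_pmd(pmd))) {
        /* copy as a raw PFN mapping */
}
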
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eed59cfb5d21..1e777cc51ad0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3654,6 +3654,9 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
return;
}
+ if (!h->max_huge_pages)
+ return;
+
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
return;
@@ -6932,6 +6935,11 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
+ pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE);
+ if (actual_pte) {
+ ret = -EEXIST;
+ goto out;
+ }
ret = -ENOMEM;
goto out;
}
diff --git a/mm/internal.h b/mm/internal.h
index 45b725c3dc03..45da9ff5694f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1333,11 +1333,6 @@ extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
-static inline bool is_migrate_highatomic(enum migratetype migratetype)
-{
- return migratetype == MIGRATE_HIGHATOMIC;
-}
-
void setup_zone_pageset(struct zone *zone);
struct migration_target_control {
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 8fce3370c84e..f084e7a5df1e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -266,11 +266,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
}
if (pgd_none(*pgd)) {
- p4d_t *p;
if (slab_is_available()) {
- p = p4d_alloc(&init_mm, pgd, addr);
- if (!p)
+ if (!p4d_alloc(&init_mm, pgd, addr))
return -ENOMEM;
} else {
pgd_populate_kernel(addr, pgd,
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index f4b17984b627..4cf2b5f8d6c1 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1073,6 +1073,45 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
kmem_cache_destroy(cache);
}
+/*
+ * Check that SLAB_TYPESAFE_BY_RCU objects are immediately reused when
+ * CONFIG_SLUB_RCU_DEBUG is off, and stay at the same address.
+ * Without this, KASAN builds would be unable to trigger bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users handling recycled objects improperly.
+ */
+static void kmem_cache_rcu_reuse(struct kunit *test)
+{
+ char *p, *p2;
+ struct kmem_cache *cache;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_SLUB_RCU_DEBUG);
+
+ cache = kmem_cache_create("test_cache", 16, 0, SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ migrate_disable();
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ goto out;
+ }
+
+ kmem_cache_free(cache, p);
+ p2 = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p2) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ goto out;
+ }
+ KUNIT_EXPECT_PTR_EQ(test, p, p2);
+
+ kmem_cache_free(cache, p2);
+
+out:
+ migrate_enable();
+ kmem_cache_destroy(cache);
+}
+
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
@@ -2106,6 +2145,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmem_cache_double_free),
KUNIT_CASE(kmem_cache_invalid_free),
KUNIT_CASE(kmem_cache_rcu_uaf),
+ KUNIT_CASE(kmem_cache_rcu_reuse),
KUNIT_CASE(kmem_cache_double_destroy),
KUNIT_CASE(kmem_cache_accounted),
KUNIT_CASE(kmem_cache_bulk),
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b486c1d19b2d..4ec324a4c1fe 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
return hpage_collapse_test_exit(mm) ||
- test_bit(MMF_DISABLE_THP, &mm->flags);
+ mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
static bool hugepage_pmd_enabled(void)
@@ -445,7 +445,7 @@ void __khugepaged_enter(struct mm_struct *mm)
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
mm_slot = mm_slot_alloc(mm_slot_cache);
@@ -472,10 +472,9 @@ void __khugepaged_enter(struct mm_struct *mm)
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
+ if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
- PMD_ORDER))
+ if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -497,7 +496,7 @@ void __khugepaged_exit(struct mm_struct *mm)
spin_unlock(&khugepaged_mm_lock);
if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
} else if (mm_slot) {
@@ -921,7 +920,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct collapse_control *cc)
{
struct vm_area_struct *vma;
- unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+ enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
+ TVA_FORCED_COLLAPSE;
if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
@@ -932,7 +932,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -1459,7 +1459,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
/*
* Not strictly needed because the mm exited already.
*
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
*/
/* khugepaged_mm_lock actually not necessary for the below */
@@ -1533,9 +1533,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* in the page cache with a single hugepage. If a mm were to fault-in
* this memory (mapped by a suitably aligned VMA), we'd get the hugepage
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
- * analogously elide sysfs THP settings here.
+ * analogously elide sysfs THP settings here and force collapse.
*/
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2402,7 +2402,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
mm_slot = khugepaged_scan.mm_slot;
slot = &mm_slot->slot;
} else {
- slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = list_first_entry(&khugepaged_scan.mm_head,
struct mm_slot, mm_node);
mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
@@ -2432,8 +2432,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags,
- TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
skip:
progress++;
continue;
@@ -2516,9 +2515,8 @@ breakouterloop_mmap_lock:
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (slot->mm_node.next != &khugepaged_scan.mm_head) {
- slot = list_entry(slot->mm_node.next,
- struct mm_slot, mm_node);
+ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
+ slot = list_next_entry(slot, mm_node);
khugepaged_scan.mm_slot =
mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
@@ -2767,7 +2765,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
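The khugepaged and madvise_collapse hunks above replace the TVA_IN_PF/TVA_ENFORCE_SYSFS flag mask with a single caller-type enum. The enum itself is defined in the THP headers and is not part of this section; the sketch below is an assumption about its shape, based only on the values used by the converted callers:

enum tva_type {
	TVA_SMAPS,		/* eligibility reporting, e.g. smaps (assumed) */
	TVA_PAGEFAULT,		/* page fault path, respects sysfs settings */
	TVA_KHUGEPAGED,		/* khugepaged collapse, respects sysfs settings */
	TVA_FORCED_COLLAPSE,	/* e.g. MADV_COLLAPSE, ignores sysfs settings */
};

/* Callers pass a type instead of a flag mask: */
if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
	return SCAN_VMA_CHECK;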
diff --git a/mm/ksm.c b/mm/ksm.c
index 160787bb121c..2ef29802a49b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1217,8 +1217,8 @@ mm_exiting:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmdrop(mm);
} else
spin_unlock(&ksm_mmlist_lock);
@@ -2620,8 +2620,8 @@ no_vmas:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmap_read_unlock(mm);
mmdrop(mm);
} else {
@@ -2742,7 +2742,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
vm_flags_t vm_flags)
{
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm) &&
__ksm_should_add_vma(file, vm_flags))
vm_flags |= VM_MERGEABLE;
@@ -2784,16 +2784,16 @@ int ksm_enable_merge_any(struct mm_struct *mm)
{
int err;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return 0;
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
err = __ksm_enter(mm);
if (err)
return err;
}
- set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_set(MMF_VM_MERGE_ANY, mm);
ksm_add_vmas(mm);
return 0;
@@ -2815,7 +2815,7 @@ int ksm_disable_merge_any(struct mm_struct *mm)
{
int err;
- if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (!mm_flags_test(MMF_VM_MERGE_ANY, mm))
return 0;
err = ksm_del_vmas(mm);
@@ -2824,7 +2824,7 @@ int ksm_disable_merge_any(struct mm_struct *mm)
return err;
}
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
return 0;
}
@@ -2832,9 +2832,9 @@ int ksm_disable(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm))
return 0;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return ksm_disable_merge_any(mm);
return ksm_del_vmas(mm);
}
@@ -2852,7 +2852,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (!vma_ksm_compatible(vma))
return 0;
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
err = __ksm_enter(mm);
if (err)
return err;
@@ -2912,7 +2912,7 @@ int __ksm_enter(struct mm_struct *mm)
list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mm_flags_set(MMF_VM_MERGEABLE, mm);
mmgrab(mm);
if (needs_wakeup)
@@ -2954,8 +2954,8 @@ void __ksm_exit(struct mm_struct *mm)
if (easy_to_free) {
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
mmdrop(mm);
} else if (mm_slot) {
mmap_write_lock(mm);
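The ksm.c, khugepaged.c, mmap.c and oom_kill.c hunks all perform the same mechanical conversion: raw test_bit()/set_bit()/clear_bit() calls on &mm->flags become mm_flags_test()/mm_flags_set()/mm_flags_clear()/mm_flags_test_and_set(). The accessors are defined in the mm headers, outside this diff; a rough sketch of what such wrappers presumably look like (the __mm_flags_word() helper is purely illustrative):

static inline unsigned long *__mm_flags_word(struct mm_struct *mm)
{
	return (unsigned long *)&mm->flags;	/* illustrative only */
}

static inline bool mm_flags_test(int flag, struct mm_struct *mm)
{
	return test_bit(flag, __mm_flags_word(mm));
}

static inline void mm_flags_set(int flag, struct mm_struct *mm)
{
	set_bit(flag, __mm_flags_word(mm));
}

static inline void mm_flags_clear(int flag, struct mm_struct *mm)
{
	clear_bit(flag, __mm_flags_word(mm));
}

static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
{
	return test_and_set_bit(flag, __mm_flags_word(mm));
}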
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..9712a751690f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2203,7 +2203,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
* try_charge() (context permitting), as well as from the userland
* return path where reclaim is always able to block.
*/
-void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
unsigned long penalty_jiffies;
unsigned long pflags;
@@ -2213,9 +2213,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
struct mem_cgroup *memcg;
bool in_retry = false;
- if (likely(!nr_pages))
- return;
-
memcg = get_mem_cgroup_from_mm(current->mm);
current->memcg_nr_pages_over_high = 0;
@@ -2486,7 +2483,7 @@ done_restock:
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
!(current->flags & PF_MEMALLOC) &&
gfpflags_allow_blocking(gfp_mask))
- mem_cgroup_handle_over_high(gfp_mask);
+ __mem_cgroup_handle_over_high(gfp_mask);
return 0;
}
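Renaming mem_cgroup_handle_over_high() to __mem_cgroup_handle_over_high() and dropping the `!nr_pages` early return only makes sense if that cheap check moved into an inline wrapper at the call sites (in the memcontrol header, not shown here). Presumed shape of that wrapper, stated as an assumption:

static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
	/* Common case: nothing accumulated, avoid the function call. */
	if (likely(!current->memcg_nr_pages_over_high))
		return;

	__mem_cgroup_handle_over_high(gfp_mask);
}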
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index df6ee59527dd..b93ab99ad3ef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1708,10 +1708,10 @@ static int identify_page_state(unsigned long pfn, struct page *p,
* carried out only if the first check can't determine the page status.
*/
for (ps = error_states;; ps++)
- if ((p->flags & ps->mask) == ps->res)
+ if ((p->flags.f & ps->mask) == ps->res)
break;
- page_flags |= (p->flags & (1UL << PG_dirty));
+ page_flags |= (p->flags.f & (1UL << PG_dirty));
if (!ps->mask)
for (ps = error_states;; ps++)
@@ -2137,7 +2137,7 @@ retry:
return action_result(pfn, MF_MSG_FREE_HUGE, res);
}
- page_flags = folio->flags;
+ page_flags = folio->flags.f;
if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
folio_unlock(folio);
@@ -2397,7 +2397,7 @@ try_again:
* folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
* status correctly, we save a copy of the page flags at this time.
*/
- page_flags = folio->flags;
+ page_flags = folio->flags.f;
/*
* __munlock_folio() may clear a writeback folio's LRU flag without
@@ -2742,13 +2742,13 @@ static int soft_offline_in_use_page(struct page *page)
putback_movable_pages(&pagelist);
pr_info("%#lx: %s migration failed %ld, type %pGp\n",
- pfn, msg_page[huge], ret, &page->flags);
+ pfn, msg_page[huge], ret, &page->flags.f);
if (ret > 0)
ret = -EBUSY;
}
} else {
pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n",
- pfn, msg_page[huge], page_count(page), &page->flags);
+ pfn, msg_page[huge], page_count(page), &page->flags.f);
ret = -EBUSY;
}
return ret;
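The `page->flags` and `folio->flags` accesses gaining a `.f` member here (and in mmzone.c and page_alloc.c below) imply that the bare `unsigned long` flags word has been wrapped in a small struct elsewhere in this series. Assumed shape of that wrapper, for orientation only:

/* Assumed definition, living in the mm type headers outside this diff. */
typedef struct {
	unsigned long f;
} memdesc_flags_t;

/*
 * struct page and struct folio then declare "memdesc_flags_t flags;",
 * so an access such as "page->flags & check_flags" becomes
 * "page->flags.f & check_flags", as in the hunks above and below.
 */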
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..d9de6c056179 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
add_mm_counter(mm, i, rss[i]);
}
-/*
- * This function is called to print an error when a bad pte
- * is found. For example, we might have a PFN-mapped pte in
- * a region that doesn't allow it.
- *
- * The calling function must still handle the error.
- */
-static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte, struct page *page)
+static bool is_bad_page_map_ratelimited(void)
{
- pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
- p4d_t *p4d = p4d_offset(pgd, addr);
- pud_t *pud = pud_offset(p4d, addr);
- pmd_t *pmd = pmd_offset(pud, addr);
- struct address_space *mapping;
- pgoff_t index;
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
@@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
- return;
+ return true;
}
if (nr_unshown) {
pr_alert("BUG: Bad page map: %lu messages suppressed\n",
@@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
+ return false;
+}
+
+static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long long pgdv, p4dv, pudv, pmdv;
+ p4d_t p4d, *p4dp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pgd_t *pgdp;
+
+ /*
+ * Although this looks like a fully lockless pgtable walk, it is not:
+ * see locking requirements for print_bad_page_map().
+ */
+ pgdp = pgd_offset(mm, addr);
+ pgdv = pgd_val(*pgdp);
+
+ if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
+ pr_alert("pgd:%08llx\n", pgdv);
+ return;
+ }
+
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ p4dv = p4d_val(p4d);
+
+ if (!p4d_present(p4d) || p4d_leaf(p4d)) {
+ pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
+ return;
+ }
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ pudv = pud_val(pud);
+
+ if (!pud_present(pud) || pud_leaf(pud)) {
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
+ return;
+ }
+
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ pmdv = pmd_val(pmd);
+
+ /*
+ * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
+ * because the table should already be mapped by the caller and
+ * doing another map would be bad. print_bad_page_map() should
+ * already take care of printing the PTE.
+ */
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
+ p4dv, pudv, pmdv);
+}
+
+/*
+ * This function is called to print an error when a bad page table entry (e.g.,
+ * corrupted page table entry) is found. For example, we might have a
+ * PFN-mapped pte in a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ *
+ * This function must be called during a proper page table walk, as it will
+ * re-walk the page table to dump information: the caller MUST prevent page
+ * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
+ * page table lock.
+ */
+static void print_bad_page_map(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long long entry, struct page *page,
+ enum pgtable_level level)
+{
+ struct address_space *mapping;
+ pgoff_t index;
+
+ if (is_bad_page_map_ratelimited())
+ return;
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
+ pgtable_level_to_str(level), entry);
+ __print_bad_page_map_pgtable(vma->vm_mm, addr);
if (page)
- dump_page(page, "bad pte");
+ dump_page(page, "bad page map");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
@@ -549,18 +611,39 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
+#define print_bad_pte(vma, addr, pte, page) \
+ print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
-/*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+/**
+ * __vm_normal_page() - Get the "struct page" associated with a page table entry.
+ * @vma: The VMA mapping the page table entry.
+ * @addr: The address where the page table entry is mapped.
+ * @pfn: The PFN stored in the page table entry.
+ * @special: Whether the page table entry is marked "special".
+ * @level: The page table level for error reporting purposes only.
+ * @entry: The page table entry value for error reporting purposes only.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
- * case, NULL is returned here. "Normal" mappings do have a struct page.
+ * case, NULL is returned here. "Normal" mappings do have a struct page and
+ * are ordinarily refcounted.
+ *
+ * Page mappings of the shared zero folios are always considered "special", as
+ * they are not ordinarily refcounted: neither the refcount nor the mapcount
+ * of these folios is adjusted when mapping them into user page tables.
+ * Selected page table walkers (such as GUP) can still identify mappings of the
+ * shared zero folios and work with the underlying "struct page".
*
- * There are 2 broad cases. Firstly, an architecture may define a pte_special()
- * pte bit, in which case this function is trivial. Secondly, an architecture
- * may not have a spare pte bit, which requires a more complicated scheme,
- * described below.
+ * There are 2 broad cases. Firstly, an architecture may define a "special"
+ * page table entry bit, such as pte_special(), in which case this function is
+ * trivial. Secondly, an architecture may not have a spare page table
+ * entry bit, which requires a more complicated scheme, described below.
+ *
+ * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
+ * page table entries that actually map "normal" pages: however, that page
+ * cannot be looked up through the PFN stored in the page table entry, but
+ * instead will be looked up through vm_ops->find_normal_page(). So far, this
+ * only applies to PTEs.
*
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
* special mapping (even if there are underlying and valid "struct pages").
@@ -585,72 +668,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
*
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
- * page (that is, those where pfn_valid is true) are refcounted and considered
- * normal pages by the VM. The only exception are zeropages, which are
- * *never* refcounted.
+ * page (that is, those where pfn_valid is true, except the shared zero
+ * folios) are refcounted and considered normal pages by the VM.
*
* The disadvantage is that pages are refcounted (which can be slower and
* simply not an option for some PFNMAP users). The advantage is that we
* don't have to follow the strict linearity rule of PFNMAP mappings in
* order to support COWable mappings.
*
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
*/
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, bool special,
+ unsigned long long entry, enum pgtable_level level)
{
- unsigned long pfn = pte_pfn(pte);
-
if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
- if (likely(!pte_special(pte)))
- goto check_pfn;
- if (vma->vm_ops && vma->vm_ops->find_special_page)
- return vma->vm_ops->find_special_page(vma, addr);
- if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
- return NULL;
- if (is_zero_pfn(pfn))
- return NULL;
-
- print_bad_pte(vma, addr, pte, NULL);
- return NULL;
- }
-
- /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- if (is_zero_pfn(pfn))
+ if (unlikely(special)) {
+#ifdef CONFIG_FIND_NORMAL_PAGE
+ if (vma->vm_ops && vma->vm_ops->find_normal_page)
+ return vma->vm_ops->find_normal_page(vma, addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
return NULL;
+
+ print_bad_page_map(vma, addr, entry, NULL, level);
+ return NULL;
}
- }
+ /*
+ * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
+ * mappings (incl. shared zero folios) are marked accordingly.
+ */
+ } else {
+ if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ /* If it has a "struct page", it's "normal". */
+ if (!pfn_valid(pfn))
+ return NULL;
+ } else {
+ unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (is_zero_pfn(pfn))
- return NULL;
+ /* Only CoW'ed anon folios are "normal". */
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
+ return NULL;
+ }
-check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
- print_bad_pte(vma, addr, pte, NULL);
+ /* Corrupted page table entry. */
+ print_bad_page_map(vma, addr, entry, NULL, level);
return NULL;
}
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
+ * For example, VDSO mappings can cause them to exist.
*/
-out:
- VM_WARN_ON_ONCE(is_zero_pfn(pfn));
+ VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
return pfn_to_page(pfn);
}
+/**
+ * vm_normal_page() - Get the "struct page" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte),
+ pte_val(pte), PGTABLE_LEVEL_PTE);
+}
+
+/**
+ * vm_normal_folio() - Get the "struct folio" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
@@ -662,43 +777,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
}
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/**
+ * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
- unsigned long pfn = pmd_pfn(pmd);
-
- /* Currently it's only used for huge pfnmaps */
- if (unlikely(pmd_special(pmd)))
- return NULL;
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
- return NULL;
- }
- }
-
- if (is_huge_zero_pfn(pfn))
- return NULL;
- if (unlikely(pfn > highest_memmap_pfn))
- return NULL;
-
- /*
- * NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
- */
-out:
- return pfn_to_page(pfn);
+ return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd),
+ pmd_val(pmd), PGTABLE_LEVEL_PMD);
}
+/**
+ * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd)
{
@@ -708,6 +817,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
return page_folio(page);
return NULL;
}
+
+/**
+ * vm_normal_page_pud() - Get the "struct page" associated with a PUD
+ * @vma: The VMA mapping the @pud.
+ * @addr: The address where the @pud is mapped.
+ * @pud: The PUD.
+ *
+ * Get the "struct page" associated with a PUD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t pud)
+{
+ return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
+ pud_val(pud), PGTABLE_LEVEL_PUD);
+}
#endif
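For the new CONFIG_FIND_NORMAL_PAGE path in __vm_normal_page(): a driver that installs pte_special() entries for pages it still wants page table walkers to resolve would select the config option and provide the callback. The sketch below is hypothetical; the names and the private-data layout are made up for illustration only.

#ifdef CONFIG_FIND_NORMAL_PAGE
static struct page *mydrv_find_normal_page(struct vm_area_struct *vma,
					   unsigned long addr)
{
	struct mydrv_mapping *m = vma->vm_private_data;	/* hypothetical */
	pgoff_t idx = (addr - vma->vm_start) >> PAGE_SHIFT;

	return m->pages[idx];	/* the "normal" page behind a special PTE */
}
#endif

static const struct vm_operations_struct mydrv_vm_ops = {
	.fault			= mydrv_fault,		/* hypothetical */
#ifdef CONFIG_FIND_NORMAL_PAGE
	.find_normal_page	= mydrv_find_normal_page,
#endif
};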
/**
@@ -4387,8 +4515,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
orders = thp_swap_suitable_orders(swp_offset(entry),
vmf->address, orders);
@@ -4935,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
* for this vma. Then filter out the orders that can't be allocated over
* the faulting address and still be fully contained in the vma.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
if (!orders)
@@ -5204,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
* It is too late to allocate a small folio, we already have a large
* folio in the pagecache: especially s390 KVM cannot tolerate any
* PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
- * PMD mappings if THPs are disabled.
+ * PMD mappings if THPs are disabled. As we already have a THP,
+ * behave as if we are forcing a collapse.
*/
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
+ /* forced_collapse= */ true))
return ret;
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
@@ -6126,8 +6256,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -6161,8 +6290,7 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
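The do_set_pmd() hunk passes a new forced_collapse argument to vma_thp_disabled(). That helper lives in the THP headers and is not part of this section; the sketch below is a guess at its updated shape, and MMF_DISABLE_THP_EXCEPT_ADVISED in particular is an assumed flag name from the same series, not confirmed by this diff:

static inline bool vma_thp_disabled(struct vm_area_struct *vma,
		vm_flags_t vm_flags, bool forced_collapse)
{
	/* Explicit per-VMA opt-out always wins. */
	if (vm_flags & VM_NOHUGEPAGE)
		return true;
	/* Process-wide "completely disabled" always wins. */
	if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, vma->vm_mm))
		return true;
	/* "Disabled unless advised": a forced collapse counts as advice. */
	if (vm_flags & VM_HUGEPAGE || forced_collapse)
		return false;
	return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); /* assumed flag */
}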
diff --git a/mm/migrate.c b/mm/migrate.c
index 9e5ef39ce73a..8e435a078fc3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -231,18 +231,17 @@ static void putback_movable_ops_page(struct page *page)
* src and dst are also released by migration core. These pages will not be
* folios in the future, so that must be reworked.
*
- * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error
- * code.
+ * Returns 0 on success, otherwise a negative error code.
*/
static int migrate_movable_ops_page(struct page *dst, struct page *src,
enum migrate_mode mode)
{
- int rc = MIGRATEPAGE_SUCCESS;
+ int rc;
VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
rc = page_movable_ops(src)->migrate_page(dst, src, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (!rc)
ClearPageMovableOpsIsolated(src);
return rc;
}
@@ -587,7 +586,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
oldzone = folio_zone(folio);
@@ -688,7 +687,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
local_irq_enable();
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
int folio_migrate_mapping(struct address_space *mapping,
@@ -737,7 +736,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
xas_unlock_irq(&xas);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/*
@@ -853,14 +852,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
return rc;
rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
return rc;
if (src_private)
folio_attach_private(dst, folio_detach_private(src));
folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/**
@@ -967,7 +966,7 @@ recheck_buffers:
}
rc = filemap_migrate_folio(mapping, dst, src, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
goto unlock_buffers;
bh = head;
@@ -1071,7 +1070,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
*
* Return value:
* < 0 - error code
- * MIGRATEPAGE_SUCCESS - success
+ * 0 - success
*/
static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
@@ -1099,7 +1098,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
else
rc = fallback_migrate_folio(mapping, dst, src, mode);
- if (rc == MIGRATEPAGE_SUCCESS) {
+ if (!rc) {
/*
* For pagecache folios, src->mapping must be cleared before src
* is freed. Anonymous folios must stay anonymous until freed.
@@ -1189,7 +1188,7 @@ static void migrate_folio_done(struct folio *src,
static int migrate_folio_unmap(new_folio_t get_new_folio,
free_folio_t put_new_folio, unsigned long private,
struct folio *src, struct folio **dstp, enum migrate_mode mode,
- enum migrate_reason reason, struct list_head *ret)
+ struct list_head *ret)
{
struct folio *dst;
int rc = -EAGAIN;
@@ -1198,16 +1197,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
bool locked = false;
bool dst_locked = false;
- if (folio_ref_count(src) == 1) {
- /* Folio was freed from under us. So we are done. */
- folio_clear_active(src);
- folio_clear_unevictable(src);
- /* free_pages_prepare() will clear PG_isolated. */
- list_del(&src->lru);
- migrate_folio_done(src, reason);
- return MIGRATEPAGE_SUCCESS;
- }
-
dst = get_new_folio(src, private);
if (!dst)
return -ENOMEM;
@@ -1297,7 +1286,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (unlikely(page_has_movable_ops(&src->page))) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
/*
@@ -1327,7 +1316,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (!folio_mapped(src)) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
out:
@@ -1459,7 +1448,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
folio_putback_hugetlb(src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
dst = get_new_folio(src, private);
@@ -1522,8 +1511,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
- remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
+ remove_migration_ptes(src, !rc ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
@@ -1532,7 +1520,7 @@ put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
- if (rc == MIGRATEPAGE_SUCCESS) {
+ if (!rc) {
move_hugetlb_state(src, dst, reason);
put_new_folio = NULL;
}
@@ -1540,7 +1528,7 @@ put_anon:
out_unlock:
folio_unlock(src);
out:
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (!rc)
folio_putback_hugetlb(src);
else if (rc != -EAGAIN)
list_move_tail(&src->lru, ret);
@@ -1650,7 +1638,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
reason, ret_folios);
/*
* The rules are:
- * Success: hugetlb folio will be put back
+ * 0: hugetlb folio will be put back
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1667,7 +1655,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
retry++;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
break;
default:
@@ -1721,7 +1709,7 @@ static void migrate_folios_move(struct list_head *src_folios,
reason, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
+ * 0: folio will be freed
* -EAGAIN: stay on the unmap_folios list
* Other errno: put on ret_folios list
*/
@@ -1731,7 +1719,7 @@ static void migrate_folios_move(struct list_head *src_folios,
*thp_retry += is_thp;
*nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
stats->nr_thp_succeeded += is_thp;
break;
@@ -1870,14 +1858,27 @@ static int migrate_pages_batch(struct list_head *from,
continue;
}
+ /*
+ * If we are holding the last folio reference, the folio
+ * was freed from under us, so just drop our reference.
+ */
+ if (likely(!page_has_movable_ops(&folio->page)) &&
+ folio_ref_count(folio) == 1) {
+ folio_clear_active(folio);
+ folio_clear_unevictable(folio);
+ list_del(&folio->lru);
+ migrate_folio_done(folio, reason);
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ continue;
+ }
+
rc = migrate_folio_unmap(get_new_folio, put_new_folio,
- private, folio, &dst, mode, reason,
- ret_folios);
+ private, folio, &dst, mode, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
- * Unmap: folio will be put on unmap_folios list,
- * dst folio put on dst_folios list
+ * 0: folio will be put on unmap_folios list,
+ * dst folio put on dst_folios list
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1927,11 +1928,7 @@ static int migrate_pages_batch(struct list_head *from,
thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
- stats->nr_succeeded += nr_pages;
- stats->nr_thp_succeeded += is_thp;
- break;
- case MIGRATEPAGE_UNMAP:
+ case 0:
list_move_tail(&folio->lru, &unmap_folios);
list_add_tail(&dst->lru, &dst_folios);
break;
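With MIGRATEPAGE_SUCCESS folded into the plain 0/-errno convention, callers of the migration helpers reduce to ordinary error-code handling. A minimal caller-side sketch of the rules spelled out in the comments above:

static void handle_migrate_result(struct folio *src, int rc,
				  struct list_head *ret_folios)
{
	if (!rc) {
		/* success: the destination folio now holds the contents */
	} else if (rc == -EAGAIN) {
		/* transient failure: leave src on the from list and retry */
	} else {
		/* hard failure: hand src back on the return list */
		list_move_tail(&src->lru, ret_folios);
	}
}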
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index e05e14d6eacd..abd9f6850db6 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (migrate && migrate->fault_page == page)
extra_cnt = 1;
r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
- if (r != MIGRATEPAGE_SUCCESS)
+ if (r)
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
else
folio_migrate_flags(newfolio, folio);
diff --git a/mm/mincore.c b/mm/mincore.c
index 10dabefc3acc..2f3e1816a30d 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -47,6 +47,48 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
return 0;
}
+static unsigned char mincore_swap(swp_entry_t entry, bool shmem)
+{
+ struct swap_info_struct *si;
+ struct folio *folio = NULL;
+ unsigned char present = 0;
+
+ if (!IS_ENABLED(CONFIG_SWAP)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * Shmem mapping may contain swapin error entries, which are
+ * absent. Page table may contain migration or hwpoison
+ * entries which are always uptodate.
+ */
+ if (non_swap_entry(entry))
+ return !shmem;
+
+ /*
+ * Shmem mapping lookup is lockless, so we need to grab the swap
+ * device. mincore page table walk locks the PTL, and the swap
+ * device is stable, avoid touching the si for better performance.
+ */
+ if (shmem) {
+ si = get_swap_device(entry);
+ if (!si)
+ return 0;
+ }
+ folio = filemap_get_entry(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (shmem)
+ put_swap_device(si);
+ /* The swap cache space contains either folio, shadow or NULL */
+ if (folio && !xa_is_value(folio)) {
+ present = folio_test_uptodate(folio);
+ folio_put(folio);
+ }
+
+ return present;
+}
+
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
@@ -64,8 +106,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- folio = filemap_get_incore_folio(mapping, index);
- if (!IS_ERR(folio)) {
+ folio = filemap_get_entry(mapping, index);
+ if (folio) {
+ if (xa_is_value(folio)) {
+ if (shmem_mapping(mapping))
+ return mincore_swap(radix_to_swp_entry(folio),
+ true);
+ else
+ return 0;
+ }
present = folio_test_uptodate(folio);
folio_put(folio);
}
@@ -143,23 +192,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (i = 0; i < step; i++)
vec[i] = 1;
} else { /* pte is a swap entry */
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (non_swap_entry(entry)) {
- /*
- * migration or hwpoison entries are always
- * uptodate
- */
- *vec = 1;
- } else {
-#ifdef CONFIG_SWAP
- *vec = mincore_page(swap_address_space(entry),
- swap_cache_index(entry));
-#else
- WARN_ON(1);
- *vec = 1;
-#endif
- }
+ *vec = mincore_swap(pte_to_swp_entry(pte), false);
}
vec += step;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 7306253cc3b5..7a057e0e8da9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
unsigned long pgoff, unsigned long flags,
vm_flags_t vm_flags)
{
- if (test_bit(MMF_TOPDOWN, &mm->flags))
+ if (mm_flags_test(MMF_TOPDOWN, mm))
return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
flags, vm_flags);
return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
@@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm)
* Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
* because the memory has been already freed.
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
vma_iter_set(&vmi, vma->vm_end);
@@ -1859,14 +1859,14 @@ loop_out:
mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
mas_store(&vmi.mas, XA_ZERO_ENTRY);
/* Avoid OOM iterating a broken tree */
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
}
/*
* The mm_struct is going to exit, but the locks will be dropped
* first. Set the mm_struct as unstable is advisable as it is
* not fully initialised.
*/
- set_bit(MMF_UNSTABLE, &mm->flags);
+ mm_flags_set(MMF_UNSTABLE, mm);
}
out:
mmap_write_unlock(mm);
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index b006cec8e6fe..0a0db5849b8e 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -128,6 +128,95 @@ void vma_mark_detached(struct vm_area_struct *vma)
}
/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ * False locked result is possible if mm_lock_seq overflows or if vma gets
+ * reused and attached to a different mm before we lock it.
+ * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
+ * detached.
+ *
+ * IMPORTANT: RCU lock must be held upon entering the function, but upon error
+ * IT IS RELEASED. The caller must handle this correctly.
+ */
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ struct mm_struct *other_mm;
+ int oldcnt;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
+ /*
+ * Check before locking. A race might cause false locked result.
+ * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+ * ACQUIRE semantics, because this is just a lockless check whose result
+ * we don't rely on for anything - the mm_lock_seq read against which we
+ * need ordering is below.
+ */
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
+ vma = NULL;
+ goto err;
+ }
+
+ /*
+ * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+ * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+ * Acquire fence is required here to avoid reordering against later
+ * vm_lock_seq check and checks inside lock_vma_under_rcu().
+ */
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT))) {
+ /* return EAGAIN if vma got detached from under us */
+ vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
+ goto err;
+ }
+
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+
+ if (unlikely(vma->vm_mm != mm))
+ goto err_unstable;
+
+ /*
+ * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ *
+ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+ * racing with vma_end_write_all(), we only start reading from the VMA
+ * after it has been unlocked.
+ * This pairs with RELEASE semantics in vma_end_write_all().
+ */
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+ vma_refcount_put(vma);
+ vma = NULL;
+ goto err;
+ }
+
+ return vma;
+err:
+ rcu_read_unlock();
+
+ return vma;
+err_unstable:
+ /*
+ * If vma got attached to another mm from under us, that mm is not
+ * stable and can be freed in the narrow window after vma->vm_refcnt
+ * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
+ * releasing vma->vm_refcnt.
+ */
+ other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */
+
+ /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
+ rcu_read_unlock();
+ mmgrab(other_mm);
+ vma_refcount_put(vma);
+ mmdrop(other_mm);
+
+ return NULL;
+}
+
+/*
* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
* stable and not isolated. If the VMA is not found or is being modified the
* function returns NULL.
@@ -138,11 +227,13 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
MA_STATE(mas, &mm->mm_mt, address, address);
struct vm_area_struct *vma;
- rcu_read_lock();
retry:
+ rcu_read_lock();
vma = mas_walk(&mas);
- if (!vma)
+ if (!vma) {
+ rcu_read_unlock();
goto inval;
+ }
vma = vma_start_read(mm, vma);
if (IS_ERR_OR_NULL(vma)) {
@@ -162,18 +253,17 @@ retry:
* From here on, we can access the VMA without worrying about which
* fields are accessible for RCU readers.
*/
+ rcu_read_unlock();
/* Check if the vma we locked is the right one. */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+ vma_end_read(vma);
+ goto inval;
+ }
- rcu_read_unlock();
return vma;
-inval_end_read:
- vma_end_read(vma);
inval:
- rcu_read_unlock();
count_vm_vma_lock_event(VMA_LOCK_ABORT);
return NULL;
}
@@ -228,6 +318,7 @@ retry:
*/
if (PTR_ERR(vma) == -EAGAIN) {
/* reset to search from the last address */
+ rcu_read_lock();
vma_iter_set(vmi, from_addr);
goto retry;
}
@@ -257,9 +348,9 @@ retry:
return vma;
fallback_unlock:
+ rcu_read_unlock();
vma_end_read(vma);
fallback:
- rcu_read_unlock();
vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
rcu_read_lock();
/* Reinitialize the iterator after re-entering rcu read section */
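The RCU handling is now internal to vma_start_read() and lock_vma_under_rcu(), but the caller-visible contract is unchanged: a returned VMA is read-locked and pinned until vma_end_read(). A sketch of a typical fault-path caller under that contract (the fallback label is hypothetical):

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;		/* fall back to mmap_lock, hypothetical label */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	/* The RETRY/COMPLETED paths drop the VMA lock internally. */
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);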
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index b49cc6385f1f..374aa6f021c6 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return false;
- batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ batch = (void *)__get_free_page(GFP_NOWAIT);
if (!batch)
return false;
@@ -364,7 +364,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
struct mmu_table_batch **batch = &tlb->batch;
if (*batch == NULL) {
- *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
if (*batch == NULL) {
tlb_table_invalidate(tlb);
tlb_remove_table_one(table);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f9baa8882fbf..0c8f181d9d50 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
unsigned long old_flags, flags;
int last_cpupid;
- old_flags = READ_ONCE(folio->flags);
+ old_flags = READ_ONCE(folio->flags.f);
do {
flags = old_flags;
last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags)));
return last_cpupid;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b819fafd57b..c3a23b082adb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -64,7 +64,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
*/
unsigned int kobjsize(const void *objp)
{
- struct page *page;
+ struct folio *folio;
/*
* If the object we have should not have ksize performed on it,
@@ -73,22 +73,22 @@ unsigned int kobjsize(const void *objp)
if (!objp || !virt_addr_valid(objp))
return 0;
- page = virt_to_head_page(objp);
+ folio = virt_to_folio(objp);
/*
* If the allocator sets PageSlab, we know the pointer came from
* kmalloc().
*/
- if (PageSlab(page))
+ if (folio_test_slab(folio))
return ksize(objp);
/*
- * If it's not a compound page, see if we have a matching VMA
+ * If it's not a large folio, see if we have a matching VMA
* region. This test is intentionally done in reverse order,
* so if there's no VMA, we still fall through and hand back
- * PAGE_SIZE for 0-order pages.
+ * PAGE_SIZE for 0-order folios.
*/
- if (!PageCompound(page)) {
+ if (!folio_test_large(folio)) {
struct vm_area_struct *vma;
vma = find_vma(current->mm, (unsigned long)objp);
@@ -100,7 +100,7 @@ unsigned int kobjsize(const void *objp)
* The ksize() function is only guaranteed to work for pointers
* returned by kmalloc(). So handle arbitrary pointers here.
*/
- return page_size(page);
+ return folio_size(folio);
}
void vfree(const void *addr)
@@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
}
EXPORT_SYMBOL(__vmalloc_noprof);
-void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int node)
{
return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM);
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 25923cfec9c6..17650f0b516e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
- *
+ *
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
@@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
*/
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN ||
- test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
+ mm_flags_test(MMF_OOM_SKIP, p->mm) ||
in_vfork(p)) {
task_unlock(p);
return LONG_MIN;
@@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
* any memory is quite low.
*/
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
- if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+ if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm))
goto next;
goto abort;
}
@@ -524,7 +524,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
* should imply barriers already and the reader would hit a page fault
* if it stumbled over a reaped memory.
*/
- set_bit(MMF_UNSTABLE, &mm->flags);
+ mm_flags_set(MMF_UNSTABLE, mm);
for_each_vma(vmi, vma) {
if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
@@ -583,7 +583,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* under mmap_lock for reading because it serializes against the
* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_flags_test(MMF_OOM_SKIP, mm)) {
trace_skip_task_reaping(tsk->pid);
goto out_unlock;
}
@@ -619,7 +619,7 @@ static void oom_reap_task(struct task_struct *tsk)
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES ||
- test_bit(MMF_OOM_SKIP, &mm->flags))
+ mm_flags_test(MMF_OOM_SKIP, mm))
goto done;
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
@@ -634,7 +634,7 @@ done:
* Hide this mm from OOM killer because it has been either reaped or
* somebody can't call mmap_write_unlock(mm).
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
/* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
@@ -670,7 +670,7 @@ static void wake_oom_reaper(struct timer_list *timer)
unsigned long flags;
/* The victim managed to terminate on its own - see exit_mmap */
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_flags_test(MMF_OOM_SKIP, mm)) {
put_task_struct(tsk);
return;
}
@@ -695,7 +695,7 @@ static void wake_oom_reaper(struct timer_list *timer)
static void queue_oom_reaper(struct task_struct *tsk)
{
/* mm is already queued? */
- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm))
return;
get_task_struct(tsk);
@@ -892,7 +892,7 @@ static bool task_will_free_mem(struct task_struct *task)
* This task has already been drained by the oom reaper so there are
* only small chances it will free some more
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (mm_flags_test(MMF_OOM_SKIP, mm))
return false;
if (atomic_read(&mm->mm_users) <= 1)
@@ -977,7 +977,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
if (is_global_init(p)) {
can_oom_reap = false;
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
task_pid_nr(victim), victim->comm,
task_pid_nr(p), p->comm);
@@ -1235,7 +1235,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
reap = true;
else {
/* Error only if the work has not been done already */
- if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm))
ret = -EINVAL;
}
task_unlock(p);
@@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
* possible change in exit_mmap is seen
*/
- if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm))
ret = -EAGAIN;
mmap_read_unlock(mm);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3e248d1c3969..5f90fd6a7137 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -38,10 +38,10 @@
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
+#include <linux/shmem_fs.h>
#include <trace/events/writeback.h>
#include "internal.h"
-#include "swap.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
@@ -2590,36 +2590,6 @@ done:
}
EXPORT_SYMBOL_GPL(writeback_iter);
-/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
- *
- * Return: %0 on success, negative error code otherwise
- *
- * Note: please use writeback_iter() instead.
- */
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data)
-{
- struct folio *folio = NULL;
- int error;
-
- while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
- error = writepage(folio, wbc, data);
- if (error == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- error = 0;
- }
- }
-
- return error;
-}
-EXPORT_SYMBOL(write_cache_pages);
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2735,12 +2705,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
{
unsigned long flags;
+ /*
+ * Shmem writeback relies on swap, and swap writeback is LRU based,
+ * not using the dirty mark.
+ */
+ VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping));
+
xa_lock_irqsave(&mapping->i_pages, flags);
if (folio->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
folio_account_dirtied(folio, mapping);
- __xa_set_mark(&mapping->i_pages, folio_index(folio),
- PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -3019,7 +2995,7 @@ bool __folio_end_writeback(struct folio *folio)
xa_lock_irqsave(&mapping->i_pages, flags);
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
- __xa_clear_mark(&mapping->i_pages, folio_index(folio),
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_WRITEBACK);
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
@@ -3056,7 +3032,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
- XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ XA_STATE(xas, &mapping->i_pages, folio->index);
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
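
With write_cache_pages() gone, filesystems are expected to drive the iterator directly, as the note in the deleted wrapper already suggested. The sketch below simply restates the removed body as an example ->writepages implementation; my_writepages() and my_write_folio() are illustrative names, not functions from this patch.

	static int my_writepages(struct address_space *mapping,
				 struct writeback_control *wbc)
	{
		struct folio *folio = NULL;
		int error = 0;

		while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
			error = my_write_folio(folio, wbc);
			if (error == AOP_WRITEPAGE_ACTIVATE) {
				/* Folio was re-activated instead of written. */
				folio_unlock(folio);
				error = 0;
			}
		}
		return error;
	}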
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d1d037f97c5f..0873d640f26c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
{
- return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS;
+ return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
}
static __always_inline void
@@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
#else
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
#endif
- BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits));
+ BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
bitmap = get_pageblock_bitmap(page, pfn);
@@ -538,8 +538,7 @@ static void set_pageblock_migratetype(struct page *page,
"Use set_pageblock_isolate() for pageblock isolation");
return;
}
- VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
- PB_migrate_isolate),
+ VM_WARN_ONCE(get_pageblock_isolate(page),
"Use clear_pageblock_isolate() to unisolate pageblock");
/* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
#endif
@@ -797,7 +796,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
- else if (is_migrate_highatomic(migratetype))
+ else if (migratetype == MIGRATE_HIGHATOMIC)
WRITE_ONCE(zone->nr_free_highatomic,
zone->nr_free_highatomic + nr_pages);
}
@@ -950,7 +949,7 @@ static inline void __free_one_page(struct page *page,
bool to_tail;
VM_BUG_ON(!zone_is_initialized(zone));
- VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
+ VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
@@ -1043,7 +1042,7 @@ static inline bool page_expected_state(struct page *page,
page->memcg_data |
#endif
page_pool_page_is_pp(page) |
- (page->flags & check_flags)))
+ (page->flags.f & check_flags)))
return false;
return true;
@@ -1059,7 +1058,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & flags)) {
+ if (unlikely(page->flags.f & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
@@ -1358,7 +1357,7 @@ __always_inline bool free_pages_prepare(struct page *page,
int i;
if (compound) {
- page[1].flags &= ~PAGE_FLAGS_SECOND;
+ page[1].flags.f &= ~PAGE_FLAGS_SECOND;
#ifdef NR_PAGES_IN_LARGE_FOLIO
folio->_nr_pages = 0;
#endif
@@ -1372,7 +1371,7 @@ __always_inline bool free_pages_prepare(struct page *page,
continue;
}
}
- (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (folio_test_anon(folio)) {
@@ -1391,7 +1390,7 @@ __always_inline bool free_pages_prepare(struct page *page,
}
page_cpupid_reset_last(page);
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
@@ -2034,7 +2033,13 @@ static int move_freepages_block(struct zone *zone, struct page *page,
/* Look for a buddy that straddles start_pfn */
static unsigned long find_large_buddy(unsigned long start_pfn)
{
- int order = 0;
+ /*
+ * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing
+ * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking
+ * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy,
+ * the starting order does not matter.
+ */
+ int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;
struct page *page;
unsigned long pfn = start_pfn;
@@ -2058,9 +2063,9 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
{
if (isolate)
- set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ set_pageblock_isolate(page);
else
- clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ clear_pageblock_isolate(page);
}
/**
@@ -4182,7 +4187,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
static inline bool
-should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
@@ -4408,7 +4413,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
alloc_flags |= ALLOC_NON_BLOCK;
- if (order > 0)
+ if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE))
alloc_flags |= ALLOC_HIGHATOMIC;
}
@@ -5946,7 +5951,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
pcp->high_min = BOOT_PAGESET_HIGH;
pcp->high_max = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
- pcp->free_count = 0;
}
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
@@ -6236,16 +6240,13 @@ static void calculate_totalreserve_pages(void)
unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ for (j = i; j < MAX_NR_ZONES; j++)
+ max = max(max, zone->lowmem_reserve[j]);
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > managed_pages)
- max = managed_pages;
+ max = min_t(unsigned long, max, managed_pages);
pgdat->totalreserve_pages += max;
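
The new starting order in find_large_buddy() rests on an alignment argument: a buddy block of order o starts at a pfn whose low o bits are clear, so any block of order at most __ffs(start_pfn) that contains start_pfn must start at start_pfn itself, and a larger enclosing buddy can only begin at order __ffs(start_pfn) + 1. The stand-alone user-space snippet below (illustrative, not kernel code) makes that cut-over visible for one example pfn.

	#include <stdio.h>

	int main(void)
	{
		unsigned long pfn = 0x2600;	/* example pfn, lowest set bit is 9 */
		int low = __builtin_ctzl(pfn);	/* user-space stand-in for __ffs() */

		for (int order = 0; order <= low + 1; order++) {
			unsigned long start = pfn & ~((1UL << order) - 1);

			printf("order %2d: containing block starts at 0x%lx%s\n",
			       order, start, start == pfn ? " (the pfn itself)" : "");
		}
		return 0;
	}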
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 648038247a8d..c6753d370ff4 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -902,23 +902,23 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->pudp = pudp;
fw->pud = pud;
- /*
- * TODO: FW_MIGRATION support for PUD migration entries
- * once there are relevant users.
- */
- if (!pud_present(pud) || pud_special(pud)) {
+ if (pud_none(pud)) {
spin_unlock(ptl);
goto not_found;
- } else if (!pud_leaf(pud)) {
+ } else if (pud_present(pud) && !pud_leaf(pud)) {
spin_unlock(ptl);
goto pmd_table;
+ } else if (pud_present(pud)) {
+ page = vm_normal_page_pud(vma, addr, pud);
+ if (page)
+ goto found;
}
/*
- * TODO: vm_normal_page_pud() will be handy once we want to
- * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+ * TODO: FW_MIGRATION support for PUD migration entries
+ * once there are relevant users.
*/
- page = pud_page(pud);
- goto found;
+ spin_unlock(ptl);
+ goto not_found;
}
pmd_table:
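
After this change a present PUD leaf is reported through vm_normal_page_pud(), so special PUD mappings and non-present PUD entries both end up at not_found. A typical caller of this interface looks roughly like the hedged sketch below; the locking comment reflects the usual contract (mmap lock held by the caller, page table lock dropped by folio_walk_end()) rather than anything introduced by this hunk.

	static struct folio *peek_mapped_folio(struct vm_area_struct *vma,
					       unsigned long addr)
	{
		struct folio_walk fw;
		struct folio *folio;

		/* Caller is assumed to hold the mmap lock of vma->vm_mm. */
		folio = folio_walk_start(&fw, vma, addr, 0);
		if (!folio)
			return NULL;

		folio_get(folio);	/* take a reference before dropping the PTL */
		folio_walk_end(&fw, vma);
		return folio;
	}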
diff --git a/mm/rmap.c b/mm/rmap.c
index 568198e9efc2..34333ae3bd80 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -79,7 +79,6 @@
#include <asm/tlbflush.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/tlb.h>
#include <trace/events/migrate.h>
#include "internal.h"
@@ -285,7 +284,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
- avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ avc = anon_vma_chain_alloc(GFP_NOWAIT);
if (unlikely(!avc)) {
unlock_anon_vma_root(root);
root = NULL;
@@ -1241,18 +1240,40 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
return page_vma_mkclean_one(&pvmw);
}
-static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
+static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
+{
+ int idx;
+
+ if (nr) {
+ idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
+ __lruvec_stat_mod_folio(folio, idx, nr);
+ }
+ if (nr_pmdmapped) {
+ if (folio_test_anon(folio)) {
+ idx = NR_ANON_THPS;
+ __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
+ } else {
+ /* NR_*_PMDMAPPED are not maintained per-memcg */
+ idx = folio_test_swapbacked(folio) ?
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
+ __mod_node_page_state(folio_pgdat(folio), idx,
+ nr_pmdmapped);
+ }
+ }
+}
+
+static __always_inline void __folio_add_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level, int *nr_pmdmapped)
+ enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
- int first = 0, nr = 0;
+ int first = 0, nr = 0, nr_pmdmapped = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_inc_and_test(&folio->_mapcount);
break;
@@ -1278,12 +1299,12 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
- if (level == RMAP_LEVEL_PMD && first)
- *nr_pmdmapped = folio_large_nr_pages(folio);
+ if (level == PGTABLE_LEVEL_PMD && first)
+ nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_inc_return_large_mapcount(folio, vma);
if (nr == 1)
/* Was completely unmapped. */
@@ -1301,8 +1322,8 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
* We only track PMD mappings of PMD-sized
* folios separately.
*/
- if (level == RMAP_LEVEL_PMD)
- *nr_pmdmapped = nr_pages;
+ if (level == PGTABLE_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
@@ -1314,8 +1335,10 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
}
folio_inc_large_mapcount(folio, vma);
break;
+ default:
+ BUILD_BUG();
}
- return nr;
+ __folio_mod_stat(folio, nr, nr_pmdmapped);
}
/**
@@ -1403,59 +1426,37 @@ static void __page_check_anon_rmap(const struct folio *folio,
page);
}
-static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
-{
- int idx;
-
- if (nr) {
- idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
- __lruvec_stat_mod_folio(folio, idx, nr);
- }
- if (nr_pmdmapped) {
- if (folio_test_anon(folio)) {
- idx = NR_ANON_THPS;
- __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
- } else {
- /* NR_*_PMDMAPPED are not maintained per-memcg */
- idx = folio_test_swapbacked(folio) ?
- NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
- __mod_node_page_state(folio_pgdat(folio), idx,
- nr_pmdmapped);
- }
- }
-}
-
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- unsigned long address, rmap_t flags, enum rmap_level level)
+ unsigned long address, rmap_t flags, enum pgtable_level level)
{
- int i, nr, nr_pmdmapped = 0;
+ int i;
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
+ __folio_add_rmap(folio, page, nr_pages, vma, level);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
- __folio_mod_stat(folio, nr, nr_pmdmapped);
-
if (flags & RMAP_EXCLUSIVE) {
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
for (i = 0; i < nr_pages; i++)
SetPageAnonExclusive(page + i);
break;
- case RMAP_LEVEL_PMD:
+ case PGTABLE_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PUD:
/*
* Keep the compiler happy, we don't support anonymous
* PUD mappings.
*/
WARN_ON_ONCE(1);
break;
+ default:
+ BUILD_BUG();
}
}
@@ -1509,7 +1510,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
rmap_t flags)
{
__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
- RMAP_LEVEL_PTE);
+ PGTABLE_LEVEL_PTE);
}
/**
@@ -1530,7 +1531,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
- RMAP_LEVEL_PMD);
+ PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1611,14 +1612,11 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
static __always_inline void __folio_add_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
- int nr, nr_pmdmapped = 0;
-
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
- __folio_mod_stat(folio, nr, nr_pmdmapped);
+ __folio_add_rmap(folio, page, nr_pages, vma, level);
/* See comments in folio_add_anon_rmap_*() */
if (!folio_test_large(folio))
@@ -1639,7 +1637,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
- __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+ __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -1656,7 +1654,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+ __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1677,7 +1675,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1685,7 +1683,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
int last = 0, nr = 0, nr_pmdmapped = 0;
@@ -1694,7 +1692,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_add_negative(-1, &folio->_mapcount);
break;
@@ -1704,7 +1702,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
if (!nr) {
/* Now completely unmapped. */
- nr = folio_nr_pages(folio);
+ nr = folio_large_nr_pages(folio);
} else {
partially_mapped = nr < folio_large_nr_pages(folio) &&
!folio_entire_mapcount(folio);
@@ -1724,11 +1722,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
last = atomic_add_negative(-1, &folio->_entire_mapcount);
- if (level == RMAP_LEVEL_PMD && last)
+ if (level == PGTABLE_LEVEL_PMD && last)
nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_dec_return_large_mapcount(folio, vma);
if (!nr) {
@@ -1748,9 +1746,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
nr_pages = folio_large_nr_pages(folio);
- if (level == RMAP_LEVEL_PMD)
+ if (level == PGTABLE_LEVEL_PMD)
nr_pmdmapped = nr_pages;
- nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
+ nr = nr_pages - nr;
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1762,6 +1760,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && nr < nr_pmdmapped;
break;
+ default:
+ BUILD_BUG();
}
/*
@@ -1801,7 +1801,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
- __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+ __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -1818,7 +1818,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+ __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1839,7 +1839,7 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
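
The switch from enum rmap_level to enum pgtable_level indicates the level constants now come from a generic page-table header shared beyond rmap. Only the three enumerators used above appear in this diff; the sketch below is an assumption about the shape of that enum, and the presence of additional levels is inferred only from the BUILD_BUG() default cases added here.

	/* Assumed shape only; the real header may order or extend this differently. */
	enum pgtable_level {
		PGTABLE_LEVEL_PTE,
		PGTABLE_LEVEL_PMD,
		PGTABLE_LEVEL_PUD,
		/* higher levels presumably follow, which is why the rmap
		 * helpers now reject anything else with BUILD_BUG() */
	};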
diff --git a/mm/shmem.c b/mm/shmem.c
index e2c76a30802b..640fecc42f60 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1006,15 +1006,15 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
unsigned long swapped = 0;
unsigned long max = end - 1;
rcu_read_lock();
- xas_for_each(&xas, page, max) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, max) {
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
swapped += 1 << xas_get_order(&xas);
if (xas.xa_index == max)
break;
@@ -1817,7 +1817,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
unsigned int global_orders;
- if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
return 0;
global_orders = shmem_huge_global_enabled(inode, index, write_end,
@@ -2430,7 +2430,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
folio_wait_writeback(folio);
- nr_pages = folio_nr_pages(folio);
/*
* Some architectures may have to restore extra metadata to the
@@ -5081,7 +5080,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC | SB_I_VERSION;
+ sb->s_flags |= SB_NOSEC;
#if IS_ENABLED(CONFIG_UNICODE)
if (!ctx->encoding && ctx->strict_encoding) {
@@ -5385,6 +5384,9 @@ int shmem_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &shmem_fs_context_ops;
+#ifdef CONFIG_TMPFS
+ fc->sb_flags |= SB_I_VERSION;
+#endif
return 0;
}
diff --git a/mm/slab.h b/mm/slab.h
index 248b34c839b7..c41a512dd07c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -50,7 +50,7 @@ typedef union {
/* Reuses the bits in struct page */
struct slab {
- unsigned long flags;
+ memdesc_flags_t flags;
struct kmem_cache *slab_cache;
union {
@@ -174,12 +174,12 @@ static inline void *slab_address(const struct slab *slab)
static inline int slab_nid(const struct slab *slab)
{
- return folio_nid(slab_folio(slab));
+ return memdesc_nid(slab->flags);
}
static inline pg_data_t *slab_pgdat(const struct slab *slab)
{
- return folio_pgdat(slab_folio(slab));
+ return NODE_DATA(slab_nid(slab));
}
static inline struct slab *virt_to_slab(const void *addr)
diff --git a/mm/slub.c b/mm/slub.c
index 30003763d224..af343ca570b5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -657,17 +657,17 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
*/
static inline bool slab_test_pfmemalloc(const struct slab *slab)
{
- return test_bit(SL_pfmemalloc, &slab->flags);
+ return test_bit(SL_pfmemalloc, &slab->flags.f);
}
static inline void slab_set_pfmemalloc(struct slab *slab)
{
- set_bit(SL_pfmemalloc, &slab->flags);
+ set_bit(SL_pfmemalloc, &slab->flags.f);
}
static inline void __slab_clear_pfmemalloc(struct slab *slab)
{
- __clear_bit(SL_pfmemalloc, &slab->flags);
+ __clear_bit(SL_pfmemalloc, &slab->flags.f);
}
/*
@@ -675,12 +675,12 @@ static inline void __slab_clear_pfmemalloc(struct slab *slab)
*/
static __always_inline void slab_lock(struct slab *slab)
{
- bit_spin_lock(SL_locked, &slab->flags);
+ bit_spin_lock(SL_locked, &slab->flags.f);
}
static __always_inline void slab_unlock(struct slab *slab)
{
- bit_spin_unlock(SL_locked, &slab->flags);
+ bit_spin_unlock(SL_locked, &slab->flags.f);
}
static inline bool
@@ -1046,7 +1046,7 @@ static void print_slab_info(const struct slab *slab)
{
pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
slab, slab->objects, slab->inuse, slab->freelist,
- &slab->flags);
+ &slab->flags.f);
}
void skip_orig_size_check(struct kmem_cache *s, const void *object)
@@ -2755,17 +2755,17 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab)
static inline bool slab_test_node_partial(const struct slab *slab)
{
- return test_bit(SL_partial, &slab->flags);
+ return test_bit(SL_partial, &slab->flags.f);
}
static inline void slab_set_node_partial(struct slab *slab)
{
- set_bit(SL_partial, &slab->flags);
+ set_bit(SL_partial, &slab->flags.f);
}
static inline void slab_clear_node_partial(struct slab *slab)
{
- clear_bit(SL_partial, &slab->flags);
+ clear_bit(SL_partial, &slab->flags.f);
}
/*
@@ -4881,7 +4881,7 @@ void kfree(const void *object)
EXPORT_SYMBOL(kfree);
static __always_inline __realloc_size(2) void *
-__do_krealloc(const void *p, size_t new_size, gfp_t flags)
+__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
{
void *ret;
size_t ks = 0;
@@ -4895,6 +4895,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
if (!kasan_check_byte(p))
return NULL;
+ /*
+	 * If reallocation is not necessary (e.g. the new size is less
+ * than the current allocated size), the current allocation will be
+ * preserved unless __GFP_THISNODE is set. In the latter case a new
+ * allocation on the requested node will be attempted.
+ */
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(virt_to_page(p)))
+ goto alloc_new;
+
if (is_kfence_address(p)) {
ks = orig_size = kfence_ksize(p);
} else {
@@ -4917,6 +4927,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
if (new_size > ks)
goto alloc_new;
+ /* If the old object doesn't satisfy the new alignment, allocate a new one */
+ if (!IS_ALIGNED((unsigned long)p, align))
+ goto alloc_new;
+
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
@@ -4939,7 +4953,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
return (void *)p;
alloc_new:
- ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
+ ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
if (ret && p) {
/* Disable KASAN checks as the object's redzone is accessed. */
kasan_disable_current();
@@ -4951,14 +4965,19 @@ alloc_new:
}
/**
- * krealloc - reallocate memory. The contents will remain unchanged.
+ * krealloc_node_align - reallocate memory. The contents will remain unchanged.
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
+ * @align: desired alignment.
* @flags: the type of memory to allocate.
+ * @nid: NUMA node or NUMA_NO_NODE
*
* If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
* is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
@@ -4983,7 +5002,8 @@ alloc_new:
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
+void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
+ gfp_t flags, int nid)
{
void *ret;
@@ -4992,13 +5012,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
return ZERO_SIZE_PTR;
}
- ret = __do_krealloc(p, new_size, flags);
+ ret = __do_krealloc(p, new_size, align, flags, nid);
if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
kfree(p);
return ret;
}
-EXPORT_SYMBOL(krealloc_noprof);
+EXPORT_SYMBOL(krealloc_node_align_noprof);
static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
{
@@ -5029,9 +5049,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
* failure, fall back to non-contiguous (vmalloc) allocation.
* @size: size of the request.
* @b: which set of kmalloc buckets to allocate from.
+ * @align: desired alignment.
* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
* @node: numa node to allocate from
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
@@ -5041,7 +5065,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
*
* Return: pointer to the allocated memory of %NULL in case of failure
*/
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
+ gfp_t flags, int node)
{
void *ret;
@@ -5071,7 +5096,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
* about the resulting pointer, and cannot play
* protection games.
*/
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
node, __builtin_return_address(0));
}
@@ -5115,14 +5140,19 @@ void kvfree_sensitive(const void *addr, size_t len)
EXPORT_SYMBOL(kvfree_sensitive);
/**
- * kvrealloc - reallocate memory; contents remain unchanged
+ * kvrealloc_node_align - reallocate memory; contents remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
+ * @align: desired alignment
* @flags: the flags for the page level allocator
+ * @nid: NUMA node id
*
* If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
* and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
@@ -5136,17 +5166,18 @@ EXPORT_SYMBOL(kvfree_sensitive);
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
void *n;
if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
+ return vrealloc_node_align_noprof(p, size, align, flags, nid);
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
if (!n) {
/* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
+ n = kvmalloc_node_align_noprof(size, align, flags, nid);
if (!n)
return NULL;
@@ -5162,7 +5193,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
return n;
}
-EXPORT_SYMBOL(kvrealloc_noprof);
+EXPORT_SYMBOL(kvrealloc_node_align_noprof);
struct detached_freelist {
struct slab *slab;
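
A hedged usage sketch for the extended krealloc/kvrealloc API follows. Only the *_noprof symbols are visible in this patch; the kvrealloc_node_align() wrapper name is assumed from the usual alloc_hooks() convention, and SMP_CACHE_BYTES is just an example alignment.

	static void *grow_node_buffer(void *buf, size_t new_size, int nid)
	{
		/*
		 * __GFP_THISNODE moves the object to @nid even when the current
		 * allocation is already large enough; without it the existing
		 * allocation (and whatever node it sits on) may simply be kept.
		 */
		return kvrealloc_node_align(buf, new_size, SMP_CACHE_BYTES,
					    GFP_KERNEL | __GFP_THISNODE, nid);
	}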
diff --git a/mm/sparse.c b/mm/sparse.c
index e6075b622407..17c50a6415c2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif
-int page_to_nid(const struct page *page)
+int memdesc_nid(memdesc_flags_t mdf)
{
- return section_to_node_table[page_to_section(page)];
+ return section_to_node_table[memdesc_section(mdf)];
}
-EXPORT_SYMBOL(page_to_nid);
+EXPORT_SYMBOL(memdesc_nid);
static void set_section_nid(unsigned long section_nr, int nid)
{
diff --git a/mm/swap.c b/mm/swap.c
index b74ebe865dd9..b8cea6a1b86f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -388,14 +388,14 @@ static void __lru_cache_activate_folio(struct folio *folio)
static void lru_gen_inc_refs(struct folio *folio)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
if (folio_test_unevictable(folio))
return;
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return;
}
@@ -407,7 +407,7 @@ static void lru_gen_inc_refs(struct folio *folio)
}
new_flags = old_flags + BIT(LRU_REFS_PGOFF);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
}
static bool lru_gen_clear_refs(struct folio *folio)
@@ -419,7 +419,7 @@ static bool lru_gen_clear_refs(struct folio *folio)
if (gen < 0)
return true;
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
lrugen = &folio_lruvec(folio)->lrugen;
/* whether can do without shuffling under the LRU lock */
@@ -1098,7 +1098,7 @@ static const struct ctl_table swap_sysctl_table[] = {
*/
void __init swap_setup(void)
{
- unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
+ unsigned long megs = PAGES_TO_MB(totalram_pages());
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
diff --git a/mm/swap.h b/mm/swap.h
index 911ad5ff0f89..1ae44d4193b1 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -64,9 +64,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index);
-
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug);
@@ -178,13 +175,6 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
return NULL;
}
-static inline
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
-{
- return filemap_get_folio(mapping, index);
-}
-
static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
{
return NULL;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c354435a0923..99513b74b5d8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -323,44 +323,6 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
return folio;
}
-/**
- * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * This differs from filemap_get_folio() in that it will also look for the
- * folio in the swap cache.
- *
- * Return: The found folio or %NULL.
- */
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
-{
- swp_entry_t swp;
- struct swap_info_struct *si;
- struct folio *folio = filemap_get_entry(mapping, index);
-
- if (!folio)
- return ERR_PTR(-ENOENT);
- if (!xa_is_value(folio))
- return folio;
- if (!shmem_mapping(mapping))
- return ERR_PTR(-ENOENT);
-
- swp = radix_to_swp_entry(folio);
- /* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(swp))
- return ERR_PTR(-ENOENT);
- /* Prevent swapoff from happening to us */
- si = get_swap_device(swp);
- if (!si)
- return ERR_PTR(-ENOENT);
- index = swap_cache_index(swp);
- folio = filemap_get_folio(swap_address_space(swp), index);
- put_swap_device(si);
- return folio;
-}
-
struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
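
The removed filemap_get_incore_folio() bundled a shmem swap-entry decode with a swap cache lookup. A caller that still needed the combined behaviour would open-code the same steps, as in the hedged sketch below; it merely restates the deleted helper and is not a function added by this patch.

	static struct folio *lookup_incore_folio(struct address_space *mapping,
						 pgoff_t index)
	{
		struct folio *folio = filemap_get_entry(mapping, index);
		struct swap_info_struct *si;
		swp_entry_t swp;

		if (!folio)
			return ERR_PTR(-ENOENT);
		if (!xa_is_value(folio))
			return folio;			/* plain page cache hit */
		if (!shmem_mapping(mapping))
			return ERR_PTR(-ENOENT);

		swp = radix_to_swp_entry(folio);
		if (non_swap_entry(swp))		/* e.g. swapin error entries */
			return ERR_PTR(-ENOENT);
		si = get_swap_device(swp);		/* keep swapoff away */
		if (!si)
			return ERR_PTR(-ENOENT);
		folio = filemap_get_folio(swap_address_space(swp),
					  swap_cache_index(swp));
		put_swap_device(si);
		return folio;
	}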
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b4f3cc712580..a7ffabbe65ef 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -470,11 +470,6 @@ static void move_cluster(struct swap_info_struct *si,
else
list_move_tail(&ci->list, list);
spin_unlock(&si->lock);
-
- if (ci->flags == CLUSTER_FLAG_FRAG)
- atomic_long_dec(&si->frag_cluster_nr[ci->order]);
- else if (new_flags == CLUSTER_FLAG_FRAG)
- atomic_long_inc(&si->frag_cluster_nr[ci->order]);
ci->flags = new_flags;
}
@@ -825,6 +820,29 @@ out:
return found;
}
+static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
+ struct list_head *list,
+ unsigned int order,
+ unsigned char usage,
+ bool scan_all)
+{
+ unsigned int found = SWAP_ENTRY_INVALID;
+
+ do {
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
+ unsigned long offset;
+
+ if (!ci)
+ break;
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ if (found)
+ break;
+ } while (scan_all);
+
+ return found;
+}
+
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
@@ -913,46 +931,46 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
}
new_cluster:
- ci = isolate_lock_cluster(si, &si->free_clusters);
- if (ci) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ /*
+ * If the device need discard, prefer new cluster over nonfull
+ * to spread out the writes.
+ */
+ if (si->flags & SWP_PAGE_DISCARD) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
if (found)
goto done;
}
- /* Try reclaim from full clusters if free clusters list is drained */
+ if (order < PMD_ORDER) {
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[order],
+ order, usage, true);
+ if (found)
+ goto done;
+ }
+
+ if (!(si->flags & SWP_PAGE_DISCARD)) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
+ if (found)
+ goto done;
+ }
+
+ /* Try reclaim full clusters if free and nonfull lists are drained */
if (vm_swap_full())
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) {
- unsigned int frags = 0, frags_existing;
-
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- /* Clusters failed to allocate are moved to frag_clusters */
- frags++;
- }
-
- frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
- while (frags < frags_existing &&
- (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
- atomic_long_dec(&si->frag_cluster_nr[order]);
- /*
- * Rotate the frag list to iterate, they were all
- * failing high order allocation or moved here due to
- * per-CPU usage, but they could contain newly released
- * reclaimable (eg. lazy-freed swap cache) slots.
- */
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- frags++;
- }
+ /*
+		 * Scanning only one fragment cluster is good enough. Order 0
+		 * allocation will surely succeed, and large allocation
+ * failure is not critical. Scanning one cluster still
+ * keeps the list rotated and reclaimed (for HAS_CACHE).
+ */
+ found = alloc_swap_scan_list(si, &si->frag_clusters[order], order,
+ usage, false);
+ if (found)
+ goto done;
}
/*
@@ -971,20 +989,15 @@ new_cluster:
* Clusters here have at least one usable slots and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
- atomic_long_dec(&si->frag_cluster_nr[o]);
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->frag_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
}
done:
if (!(si->flags & SWP_SOLIDSTATE))
@@ -3224,7 +3237,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&si->nonfull_clusters[i]);
INIT_LIST_HEAD(&si->frag_clusters[i]);
- atomic_long_set(&si->frag_cluster_nr[i], 0);
}
/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index aefdf3a812a1..50aaa8dcd24c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1026,18 +1026,64 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}
-static int move_present_pte(struct mm_struct *mm,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- pte_t *dst_pte, pte_t *src_pte,
- pte_t orig_dst_pte, pte_t orig_src_pte,
- pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+/*
+ * Checks if the two ptes and the corresponding folio are eligible for batched
+ * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL.
+ *
+ * NOTE: folio's reference is not required as the whole operation is within
+ * PTL's critical section.
+ */
+static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
+ unsigned long src_addr,
+ pte_t *src_pte, pte_t *dst_pte,
+ struct anon_vma *src_anon_vma)
+{
+ pte_t orig_dst_pte, orig_src_pte;
+ struct folio *folio;
+
+ orig_dst_pte = ptep_get(dst_pte);
+ if (!pte_none(orig_dst_pte))
+ return NULL;
+
+ orig_src_pte = ptep_get(src_pte);
+ if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
+ return NULL;
+
+ folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+ if (!folio || !folio_trylock(folio))
+ return NULL;
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
+ folio_anon_vma(folio) != src_anon_vma) {
+ folio_unlock(folio);
+ return NULL;
+ }
+ return folio;
+}
+
+/*
+ * Moves src folios to dst in a batch as long as they share the same
+ * anon_vma as the first folio, are not large, and can successfully
+ * take the lock via folio_trylock().
+ */
+static long move_present_ptes(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio **first_src_folio, unsigned long len,
+ struct anon_vma *src_anon_vma)
{
int err = 0;
+ struct folio *src_folio = *first_src_folio;
+ unsigned long src_start = src_addr;
+ unsigned long src_end;
+ len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
+ src_end = pmd_addr_end(src_addr, src_addr + len);
+ flush_cache_range(src_vma, src_addr, src_end);
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1051,31 +1097,56 @@ static int move_present_pte(struct mm_struct *mm,
err = -EBUSY;
goto out;
}
+ /* It's safe to drop the reference now as the page-table is holding one. */
+ folio_put(*first_src_folio);
+ *first_src_folio = NULL;
+ arch_enter_lazy_mmu_mode();
+
+ while (true) {
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+ err = -EBUSY;
+ break;
+ }
- orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pte_at(mm, src_addr, src_pte, orig_src_pte);
- err = -EBUSY;
- goto out;
- }
-
- folio_move_anon_rmap(src_folio, dst_vma);
- src_folio->index = linear_page_index(dst_vma, dst_addr);
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
- /* Set soft dirty bit so userspace can notice the pte was moved */
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
#endif
- if (pte_dirty(orig_src_pte))
- orig_dst_pte = pte_mkdirty(orig_dst_pte);
- orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+
+ src_addr += PAGE_SIZE;
+ if (src_addr == src_end)
+ break;
+ dst_addr += PAGE_SIZE;
+ dst_pte++;
+ src_pte++;
+
+ folio_unlock(src_folio);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
+ dst_pte, src_anon_vma);
+ if (!src_folio)
+ break;
+ }
- set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+ arch_leave_lazy_mmu_mode();
+ if (src_addr > src_start)
+ flush_tlb_range(src_vma, src_start, src_addr);
+
+ if (src_folio)
+ folio_unlock(src_folio);
out:
double_pt_unlock(dst_ptl, src_ptl);
- return err;
+ return src_addr > src_start ? src_addr - src_start : err;
}
static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
@@ -1140,7 +1211,7 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
static int move_zeropage_pte(struct mm_struct *mm,
@@ -1167,20 +1238,20 @@ static int move_zeropage_pte(struct mm_struct *mm,
set_pte_at(mm, dst_addr, dst_pte, zero_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
/*
- * The mmap_lock for reading is held by the caller. Just move the page
- * from src_pmd to dst_pmd if possible, and return true if succeeded
- * in moving the page.
+ * The mmap_lock for reading is held by the caller. Just move the page(s)
+ * from src_pmd to dst_pmd if possible, and return number of bytes moved.
+ * On failure, an error code is returned.
*/
-static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- __u64 mode)
+static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ unsigned long len, __u64 mode)
{
swp_entry_t entry;
struct swap_info_struct *si = NULL;
@@ -1194,11 +1265,10 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
- int err = 0;
+ long ret = 0;
- flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- src_addr, src_addr + PAGE_SIZE);
+ src_addr, src_addr + len);
mmu_notifier_invalidate_range_start(&range);
retry:
/*
@@ -1212,7 +1282,7 @@ retry:
/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
@@ -1231,14 +1301,14 @@ retry:
* transparent huge pages under us.
*/
if (unlikely(!src_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/* Sanity checks before the operation */
if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -1246,7 +1316,7 @@ retry:
orig_dst_pte = ptep_get(dst_pte);
spin_unlock(dst_ptl);
if (!pte_none(orig_dst_pte)) {
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1255,21 +1325,21 @@ retry:
spin_unlock(src_ptl);
if (pte_none(orig_src_pte)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
- err = -ENOENT;
+ ret = -ENOENT;
else /* nothing to do to move a hole */
- err = 0;
+ ret = PAGE_SIZE;
goto out;
}
	/* If PTE changed after we locked the folio then start over */
if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (pte_present(orig_src_pte)) {
if (is_zero_pfn(pte_pfn(orig_src_pte))) {
- err = move_zeropage_pte(mm, dst_vma, src_vma,
+ ret = move_zeropage_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte,
dst_pmd, dst_pmdval, dst_ptl, src_ptl);
@@ -1292,14 +1362,14 @@ retry:
spin_lock(src_ptl);
if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
if (!folio || !PageAnonExclusive(&folio->page)) {
spin_unlock(src_ptl);
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
@@ -1313,7 +1383,7 @@ retry:
*/
if (!locked && folio_test_large(folio)) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
@@ -1332,7 +1402,7 @@ retry:
}
if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
}
@@ -1343,8 +1413,8 @@ retry:
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
- err = split_folio(src_folio);
- if (err)
+ ret = split_folio(src_folio);
+ if (ret)
goto out;
/* have to reacquire the folio after it got split */
folio_unlock(src_folio);
@@ -1362,7 +1432,7 @@ retry:
src_anon_vma = folio_get_anon_vma(src_folio);
if (!src_anon_vma) {
/* page was unmapped from under us */
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
@@ -1375,10 +1445,11 @@ retry:
}
}
- err = move_present_pte(mm, dst_vma, src_vma,
- dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl, src_folio);
+ ret = move_present_ptes(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, &src_folio,
+ len, src_anon_vma);
} else {
struct folio *folio = NULL;
@@ -1389,20 +1460,20 @@ retry:
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
- err = -EAGAIN;
+ ret = -EAGAIN;
} else
- err = -EFAULT;
+ ret = -EFAULT;
goto out;
}
if (!pte_swp_exclusive(orig_src_pte)) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
si = get_swap_device(entry);
if (unlikely(!si)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
@@ -1422,7 +1493,7 @@ retry:
swap_cache_index(entry));
if (!IS_ERR_OR_NULL(folio)) {
if (folio_test_large(folio)) {
- err = -EBUSY;
+ ret = -EBUSY;
folio_put(folio);
goto out;
}
@@ -1439,7 +1510,7 @@ retry:
goto retry;
}
}
- err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
dst_ptl, src_ptl, src_folio, si, entry);
}
@@ -1466,7 +1537,7 @@ out:
if (si)
put_swap_device(si);
- return err;
+ return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1737,7 +1808,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
- unsigned long src_addr, dst_addr;
+ unsigned long src_addr, dst_addr, src_end;
pmd_t *src_pmd, *dst_pmd;
long err = -EINVAL;
ssize_t moved = 0;
@@ -1780,8 +1851,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
if (err)
goto out_unlock;
- for (src_addr = src_start, dst_addr = dst_start;
- src_addr < src_start + len;) {
+ for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
+ src_addr < src_end;) {
spinlock_t *ptl;
pmd_t dst_pmdval;
unsigned long step_size;
@@ -1849,6 +1920,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
dst_addr, src_addr);
step_size = HPAGE_PMD_SIZE;
} else {
+ long ret;
+
if (pmd_none(*src_pmd)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
err = -ENOENT;
@@ -1865,10 +1938,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
break;
}
- err = move_pages_pte(mm, dst_pmd, src_pmd,
- dst_vma, src_vma,
- dst_addr, src_addr, mode);
- step_size = PAGE_SIZE;
+ ret = move_pages_ptes(mm, dst_pmd, src_pmd,
+ dst_vma, src_vma, dst_addr,
+ src_addr, src_end - src_addr, mode);
+ if (ret < 0)
+ err = ret;
+ else
+ step_size = ret;
}
cond_resched();
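
On the kernel side the batching above means a single UFFDIO_MOVE ioctl can now move several present PTEs per PTL acquisition. The user-space sketch below shows the ioctl that ends up in move_pages(); it is illustrative only, uffd is assumed to be a userfaultfd whose destination range was registered beforehand, and src/dst/len are assumed to be page aligned.

	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	static long move_range(int uffd, unsigned long dst, unsigned long src,
			       unsigned long len)
	{
		struct uffdio_move move = {
			.dst  = dst,
			.src  = src,
			.len  = len,
			.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES,
		};

		if (ioctl(uffd, UFFDIO_MOVE, &move))
			return -1;	/* move.move holds bytes moved or a negative error */
		return move.move;
	}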
diff --git a/mm/util.c b/mm/util.c
index f814e6a59ab1..d235b74f7aff 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
- set_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_set(MMF_TOPDOWN, mm);
}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
}
#endif
#ifdef CONFIG_MMU
diff --git a/mm/vma.h b/mm/vma.h
index b123a9cdedb0..bcdc261c5b15 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -145,7 +145,7 @@ struct vma_merge_struct {
*/
bool __remove_middle :1;
/*
- * Internal flag used during the merge operationr to indicate we will
+ * Internal flag used during the merge operation to indicate we will
* remove vmg->next.
*/
bool __remove_next :1;
diff --git a/mm/vma_init.c b/mm/vma_init.c
index 8e53c7943561..d847c6557261 100644
--- a/mm/vma_init.c
+++ b/mm/vma_init.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared
+ * Functions for initializing, allocating, freeing and duplicating VMAs. Shared
* between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
*/
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5edd536ba9d2..4249e1e01947 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4089,19 +4089,29 @@ void *vzalloc_node_noprof(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node_noprof);
/**
- * vrealloc - reallocate virtually contiguous memory; contents remain unchanged
+ * vrealloc_node_align - reallocate virtually contiguous memory; contents
+ * remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
+ * @align: requested alignment
* @flags: the flags for the page level allocator
+ * @nid: node number of the target node
+ *
+ * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
- * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and
- * @p is not a %NULL pointer, the object pointed to is freed.
+ * If the caller wants the new memory to be on a specific node *only*, the
+ * __GFP_THISNODE flag should be set; otherwise the function will try to avoid
+ * reallocation and possibly disregard the specified @nid.
*
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
* __GFP_ZERO is not fully honored by this API.
*
+ * Requesting an alignment that is bigger than the alignment of the existing
+ * allocation will fail.
+ *
* In any case, the contents of the object pointed to are preserved up to the
* lesser of the new and old sizes.
*
@@ -4111,7 +4121,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
* Return: pointer to the allocated memory; %NULL if @size is zero or in case of
* failure
*/
-void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
struct vm_struct *vm = NULL;
size_t alloced_size = 0;
@@ -4135,6 +4146,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
if (WARN(alloced_size < old_size,
"vrealloc() has mismatched area vs requested sizes (%p)\n", p))
return NULL;
+ if (WARN(!IS_ALIGNED((unsigned long)p, align),
+ "will not reallocate with a bigger alignment (0x%lx)\n", align))
+ return NULL;
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(vmalloc_to_page(p)))
+ goto need_realloc;
}
/*
@@ -4165,8 +4182,10 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
return (void *)p;
}
+need_realloc:
/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
- n = __vmalloc_noprof(size, flags);
+ n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0));
+
if (!n)
return NULL;
@@ -5177,7 +5196,7 @@ static void vmap_init_nodes(void)
int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
- vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
+ vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
if (vn) {
/* Node partition is 16 pages. */
vmap_zone_size = (1 << 4) * PAGE_SIZE;
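
One consequence of the alignment check added to vrealloc is that callers cannot use it to increase an object's alignment; only node placement (via __GFP_THISNODE) can force a move. A hedged sketch, assuming a vrealloc_node_align() wrapper around the *_noprof symbol above:

	static void *realign_example(void)
	{
		void *p = vmalloc(PAGE_SIZE);	/* only guaranteed page-aligned */
		void *q;

		if (!p)
			return NULL;
		/*
		 * If p does not already happen to be 4 * PAGE_SIZE aligned, this
		 * trips the new WARN() and returns NULL instead of moving the
		 * allocation; the old object is left untouched on failure.
		 */
		q = vrealloc_node_align(p, 2 * PAGE_SIZE, 4 * PAGE_SIZE,
					GFP_KERNEL, NUMA_NO_NODE);
		if (!q)
			vfree(p);
		return q;
	}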
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 674999999cd0..ca9e1cd3cd68 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -398,14 +398,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
- /*
- * If there are no reclaimable file-backed or anonymous pages,
- * ensure zones with sufficient free pages are not skipped.
- * This prevents zones like DMA32 from being ignored in reclaim
- * scenarios where they can still help alleviate memory pressure.
- */
- if (nr == 0)
- nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
return nr;
}
@@ -888,11 +881,11 @@ static bool lru_gen_set_refs(struct folio *folio)
{
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return false;
}
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
return true;
}
#else
@@ -3257,13 +3250,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
/* promote pages accessed through page tables */
static int folio_update_gen(struct folio *folio, int gen)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return -1;
}
@@ -3274,7 +3267,7 @@ static int folio_update_gen(struct folio *folio, int gen)
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
@@ -3285,7 +3278,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
int type = folio_is_file_lru(folio);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
@@ -3302,7 +3295,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
/* for folio_end_writeback() */
if (reclaiming)
new_flags |= BIT(PG_reclaim);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
lru_gen_update_size(lruvec, folio, old_gen, new_gen);
@@ -4553,7 +4546,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
@@ -4766,7 +4759,7 @@ retry:
/* don't add rejected folios to the oldest generation */
if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -5561,6 +5554,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (memcg_id != mem_cgroup_id(memcg))
goto done;
+ sc->target_mem_cgroup = memcg;
lruvec = get_lruvec(memcg, nid);
if (swappiness < MIN_SWAPPINESS)
@@ -5597,6 +5591,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
.may_swap = true,
.reclaim_idx = MAX_NR_ZONES - 1,
.gfp_mask = GFP_KERNEL,
+ .proactive = true,
};
buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -6493,7 +6488,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
return true;
for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
- if (!zone_reclaimable_pages(zone))
+ if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 71cd1ceba191..e522decf6a72 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1280,6 +1280,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA_BALANCING
[I(PGPROMOTE_SUCCESS)] = "pgpromote_success",
[I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate",
+ [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl",
#endif
[I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd",
[I(PGDEMOTE_DIRECT)] = "pgdemote_direct",
@@ -1289,6 +1290,7 @@ const char * const vmstat_text[] = {
[I(NR_HUGETLB)] = "nr_hugetlb",
#endif
[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
+ [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
#undef I
/* system-wide enum vm_stat_item counters */
diff --git a/mm/workingset.c b/mm/workingset.c
index 6e7f4cb1b9a7..68a76a91111f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
} else
- set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 805a10b41266..153783d49d34 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1746,7 +1746,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* instead.
*/
if (!zpdesc->zspage)
- return MIGRATEPAGE_SUCCESS;
+ return 0;
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
@@ -1813,7 +1813,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
reset_zpdesc(zpdesc);
zpdesc_put(zpdesc);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
static void zs_page_putback(struct page *page)
diff --git a/mm/zswap.c b/mm/zswap.c
index 3c0fd8a13718..e5e1f5687f5e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -42,8 +42,10 @@
/*********************************
* statistics
**********************************/
-/* The number of compressed pages currently stored in zswap */
+/* The number of pages currently stored in zswap */
atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
+/* The number of incompressible pages currently stored in zswap */
+static atomic_long_t zswap_stored_incompressible_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -811,6 +813,8 @@ static void zswap_entry_free(struct zswap_entry *entry)
obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
obj_cgroup_put(entry->objcg);
}
+ if (entry->length == PAGE_SIZE)
+ atomic_long_dec(&zswap_stored_incompressible_pages);
zswap_entry_cache_free(entry);
atomic_long_dec(&zswap_stored_pages);
}
@@ -827,7 +831,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
u8 *buffer = NULL;
int ret;
- buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
if (!buffer) {
ret = -ENOMEM;
goto fail;
@@ -948,18 +952,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zpool *zpool;
gfp_t gfp;
u8 *dst;
+ bool mapped = false;
acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
- /*
- * We need PAGE_SIZE * 2 here since there maybe over-compression case,
- * and hardware-accelerators may won't check the dst buffer size, so
- * giving the dst buffer with enough length to avoid buffer overflow.
- */
- sg_init_one(&output, dst, PAGE_SIZE * 2);
+ sg_init_one(&output, dst, PAGE_SIZE);
acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
/*
@@ -976,8 +976,26 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
*/
comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- if (comp_ret)
- goto unlock;
+
+ /*
+ * If a page cannot be compressed into a size smaller than PAGE_SIZE,
+ * save the content as-is, without compression, to keep the LRU order
+ * of writebacks. If writeback is disabled, reject the page since it
+ * only adds metadata overhead. swap_writeout() will put the page back
+ * on the active LRU list in that case.
+ */
+ if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
+ dlen = PAGE_SIZE;
+ if (!mem_cgroup_zswap_writeback_enabled(
+ folio_memcg(page_folio(page)))) {
+ comp_ret = comp_ret ? comp_ret : -EINVAL;
+ goto unlock;
+ }
+ comp_ret = 0;
+ dlen = PAGE_SIZE;
+ dst = kmap_local_page(page);
+ mapped = true;
+ }
zpool = pool->zpool;
gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
@@ -990,6 +1008,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
entry->length = dlen;
unlock:
+ if (mapped)
+ kunmap_local(dst);
if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
zswap_reject_compress_poor++;
else if (comp_ret)
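
To summarize the control flow the zswap_compress() hunk above introduces, here is an illustrative sketch (the helper name and return convention are invented, not part of the patch): a compression result that does not shrink the page is either stored verbatim or rejected, depending on whether the owning memcg still allows zswap writeback.

/*
 * Sketch only: 0 means use the compressed copy, 1 means store the page
 * verbatim (incompressible), and a negative errno means reject the store
 * because writeback is disabled and caching the raw page would only add
 * metadata overhead.
 */
static int zswap_incompressible_policy(int comp_ret, unsigned int dlen,
				       struct page *page)
{
	if (!comp_ret && dlen && dlen < PAGE_SIZE)
		return 0;

	if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(page_folio(page))))
		return comp_ret ? comp_ret : -EINVAL;

	return 1;
}

In the patch itself the verbatim path sets dlen to PAGE_SIZE and maps the page with kmap_local_page(), so the raw contents are handed to the zpool and later copied back with memcpy_to_folio() on the decompress side.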
@@ -1006,12 +1026,18 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct zpool *zpool = entry->pool->zpool;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- int decomp_ret, dlen;
+ int decomp_ret = 0, dlen = PAGE_SIZE;
u8 *src, *obj;
acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+ /* zswap entries of length PAGE_SIZE are not compressed. */
+ if (entry->length == PAGE_SIZE) {
+ memcpy_to_folio(folio, 0, obj, entry->length);
+ goto read_done;
+ }
+
/*
* zpool_obj_read_begin() might return a kmap address of highmem when
* acomp_ctx->buffer is not used. However, sg_init_one() does not
@@ -1032,6 +1058,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
+read_done:
zpool_obj_read_end(zpool, entry->handle, obj);
acomp_ctx_put_unlock(acomp_ctx);
@@ -1524,6 +1551,8 @@ static bool zswap_store_page(struct page *page,
obj_cgroup_charge_zswap(objcg, entry->length);
}
atomic_long_inc(&zswap_stored_pages);
+ if (entry->length == PAGE_SIZE)
+ atomic_long_inc(&zswap_stored_incompressible_pages);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1792,6 +1821,14 @@ static int debugfs_get_stored_pages(void *data, u64 *val)
}
DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n");
+static int debugfs_get_stored_incompressible_pages(void *data, u64 *val)
+{
+ *val = atomic_long_read(&zswap_stored_incompressible_pages);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(stored_incompressible_pages_fops,
+ debugfs_get_stored_incompressible_pages, NULL, "%llu\n");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1819,6 +1856,9 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_file("stored_pages", 0444,
zswap_debugfs_root, NULL, &stored_pages_fops);
+ debugfs_create_file("stored_incompressible_pages", 0444,
+ zswap_debugfs_root, NULL,
+ &stored_incompressible_pages_fops);
return 0;
}
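
The new counter is exported read-only through debugfs next to the existing zswap statistics. Assuming debugfs is mounted at its usual /sys/kernel/debug location and the zswap directory keeps its conventional name (neither is visible in this hunk), a small userspace reader might look like:

/* Userspace example, illustrative only; the path below is an assumption. */
#include <stdio.h>

int main(void)
{
	unsigned long long pages = 0;
	FILE *f = fopen("/sys/kernel/debug/zswap/stored_incompressible_pages", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%llu", &pages) == 1)
		printf("incompressible pages in zswap: %llu\n", pages);
	fclose(f);
	return 0;
}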