author		Linus Torvalds <torvalds@linux-foundation.org>	2025-10-02 18:18:33 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2025-10-02 18:18:33 -0700
commit		8804d970fab45726b3c7cd7f240b31122aa94219 (patch)
tree		941833bb8b535ed01a5478be14c08522e7c3644a /include/linux
parent		24d9e8b3c9c8a6f72c8b4c196a703e144928d919 (diff)
parent		1367da7eb875d01102d2ed18654b24d261ff5393 (diff)
Merge tag 'mm-stable-2025-10-01-19-00' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - "mm, swap: improve cluster scan strategy" from Kairui Song improves performance and reduces the failure rate of swap cluster allocation
 - "support large align and nid in Rust allocators" from Vitaly Wool permits Rust allocators to set NUMA node and large alignment when performing slub and vmalloc reallocs
 - "mm/damon/vaddr: support stat-purpose DAMOS" from Yueyang Pan extends DAMOS_STAT's handling of the DAMON operations sets for virtual address spaces for ops-level DAMOS filters
 - "execute PROCMAP_QUERY ioctl under per-vma lock" from Suren Baghdasaryan reduces mmap_lock contention during reads of /proc/pid/maps
 - "mm/mincore: minor clean up for swap cache checking" from Kairui Song performs some cleanup in the swap code
 - "mm: vm_normal_page*() improvements" from David Hildenbrand provides code cleanup in the pagemap code
 - "add persistent huge zero folio support" from Pankaj Raghav provides a block layer speedup by optionally making the huge_zero_page persistent, instead of releasing it when its refcount falls to zero
 - "kho: fixes and cleanups" from Mike Rapoport adds a few touchups to the recently added Kexec Handover feature
 - "mm: make mm->flags a bitmap and 64-bit on all arches" from Lorenzo Stoakes turns mm_struct.flags into a bitmap, to end the constant struggle with space shortage on 32-bit conflicting with 64-bit's needs
 - "mm/swapfile.c and swap.h cleanup" from Chris Li cleans up some swap code
 - "selftests/mm: Fix false positives and skip unsupported tests" from Donet Tom fixes a few things in our selftests code
 - "prctl: extend PR_SET_THP_DISABLE to only provide THPs when advised" from David Hildenbrand "allows individual processes to opt-out of THP=always into THP=madvise, without affecting other workloads on the system". It's a long story - the [1/N] changelog spells out the considerations
 - "Add and use memdesc_flags_t" from Matthew Wilcox gets us started on the memdesc project. Please see https://kernelnewbies.org/MatthewWilcox/Memdescs and https://blogs.oracle.com/linux/post/introducing-memdesc
 - "Tiny optimization for large read operations" from Chi Zhiling improves the efficiency of the pagecache read path
 - "Better split_huge_page_test result check" from Zi Yan improves our folio splitting selftest code
 - "test that rmap behaves as expected" from Wei Yang adds some rmap selftests
 - "remove write_cache_pages()" from Christoph Hellwig removes that function and converts its two remaining callers
 - "selftests/mm: uffd-stress fixes" from Dev Jain fixes some UFFD selftests issues
 - "introduce kernel file mapped folios" from Boris Burkov introduces the concept of "kernel file pages". Using these permits btrfs to account its metadata pages to the root cgroup, rather than to the cgroups of random inappropriate tasks
 - "mm/pageblock: improve readability of some pageblock handling" from Wei Yang provides some readability improvements to the page allocator code
 - "mm/damon: support ARM32 with LPAE" from SeongJae Park teaches DAMON to understand arm32 highmem
 - "tools: testing: Use existing atomic.h for vma/maple tests" from Brendan Jackman performs some code cleanups and deduplication under tools/testing/
 - "maple_tree: Fix testing for 32bit compiles" from Liam Howlett fixes a couple of 32-bit issues in tools/testing/radix-tree.c
 - "kasan: unify kasan_enabled() and remove arch-specific implementations" from Sabyrzhan Tasbolatov moves KASAN arch-specific initialization code into a common arch-neutral implementation
 - "mm: remove zpool" from Johannes Weiner removes zpool - an indirection layer which now only redirects to a single thing (zsmalloc)
 - "mm: task_stack: Stack handling cleanups" from Pasha Tatashin makes a couple of cleanups in the fork code
 - "mm: remove nth_page()" from David Hildenbrand makes rather a lot of adjustments at various nth_page() callsites, eventually permitting the removal of that undesirable helper function
 - "introduce kasan.write_only option in hw-tags" from Yeoreum Yun creates a KASAN read-only mode for ARM, using that architecture's memory tagging feature. It is felt that a read-only mode KASAN is suitable for use in production systems rather than debug-only
 - "mm: hugetlb: cleanup hugetlb folio allocation" from Kefeng Wang does some tidying in the hugetlb folio allocation code
 - "mm: establish const-correctness for pointer parameters" from Max Kellermann makes quite a number of the MM API functions more accurate about the constness of their arguments. This was getting in the way of subsystems (in this case CEPH) when they attempt to improve their own const/non-const accuracy
 - "Cleanup free_pages() misuse" from Vishal Moola fixes a number of code sites which were confused over when to use free_pages() vs __free_pages()
 - "Add Rust abstraction for Maple Trees" from Alice Ryhl makes the mapletree code accessible to Rust. Required by nouveau and by its forthcoming successor: the new Rust Nova driver
 - "selftests/mm: split_huge_page_test: split_pte_mapped_thp improvements" from David Hildenbrand adds a fix and some cleanups to the thp selftesting code
 - "mm, swap: introduce swap table as swap cache (phase I)" from Chris Li and Kairui Song is the first step along the path to implementing "swap tables" - a new approach to swap allocation and state tracking which is expected to yield speed and space improvements. This patchset itself yields a 5-20% performance benefit in some situations
 - "Some ptdesc cleanups" from Matthew Wilcox utilizes the new memdesc layer to clean up the ptdesc code a little
 - "Fix va_high_addr_switch.sh test failure" from Chunyu Hu fixes some issues in our 5-level pagetable selftesting code
 - "Minor fixes for memory allocation profiling" from Suren Baghdasaryan addresses a couple of minor issues in the relatively new memory allocation profiling feature
 - "Small cleanups" from Matthew Wilcox has a few cleanups in preparation for more memdesc work
 - "mm/damon: add addr_unit for DAMON_LRU_SORT and DAMON_RECLAIM" from Quanmin Yan makes some changes to DAMON in furtherance of supporting arm highmem
 - "selftests/mm: Add -Wunreachable-code and fix warnings" from Muhammad Anjum adds that compiler check to selftests code and fixes the fallout, by removing dead code
 - "Improvements to Victim Process Thawing and OOM Reaper Traversal Order" from zhongjinji makes a number of improvements in the OOM killer: mainly thawing a more appropriate group of victim threads so they can release resources
 - "mm/damon: misc fixups and improvements for 6.18" from SeongJae Park is a bunch of small and unrelated fixups for DAMON
 - "mm/damon: define and use DAMON initialization check function" from SeongJae Park implements reliability and maintainability improvements to a recently-added bug fix
 - "mm/damon/stat: expose auto-tuned intervals and non-idle ages" from SeongJae Park provides additional transparency to userspace clients of the DAMON_STAT information
 - "Expand scope of khugepaged anonymous collapse" from Dev Jain removes some constraints on khugepaged's collapsing of anon VMAs. It also increases the success rate of MADV_COLLAPSE against an anon vma
 - "mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()" from Lorenzo Stoakes moves us further towards removal of file_operations.mmap(). This patchset concentrates upon clearing up the treatment of stacked filesystems
 - "mm: Improve mlock tracking for large folios" from Kiryl Shutsemau provides some fixes and improvements to mlock's tracking of large folios. /proc/meminfo's "Mlocked" field became more accurate
 - "mm/ksm: Fix incorrect accounting of KSM counters during fork" from Donet Tom fixes several user-visible KSM stats inaccuracies across forks and adds selftest code to verify these counters
 - "mm_slot: fix the usage of mm_slot_entry" from Wei Yang addresses some potential but presently benign issues in KSM's mm_slot handling

* tag 'mm-stable-2025-10-01-19-00' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (372 commits)
  mm: swap: check for stable address space before operating on the VMA
  mm: convert folio_page() back to a macro
  mm/khugepaged: use start_addr/addr for improved readability
  hugetlbfs: skip VMAs without shareable locks in hugetlb_vmdelete_list
  alloc_tag: fix boot failure due to NULL pointer dereference
  mm: silence data-race in update_hiwater_rss
  mm/memory-failure: don't select MEMORY_ISOLATION
  mm/khugepaged: remove definition of struct khugepaged_mm_slot
  mm/ksm: get mm_slot by mm_slot_entry() when slot is !NULL
  hugetlb: increase number of reserving hugepages via cmdline
  selftests/mm: add fork inheritance test for ksm_merging_pages counter
  mm/ksm: fix incorrect KSM counter handling in mm_struct during fork
  drivers/base/node: fix double free in register_one_node()
  mm: remove PMD alignment constraint in execmem_vmalloc()
  mm/memory_hotplug: fix typo 'esecially' -> 'especially'
  mm/rmap: improve mlock tracking for large folios
  mm/filemap: map entire large folio faultaround
  mm/fault: try to map the entire file folio in finish_fault()
  mm/rmap: mlock large folios in try_to_unmap_one()
  mm/rmap: fix a mlock race condition in folio_referenced_one()
  ...
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/alloc_tag.h | 12
-rw-r--r--  include/linux/blkdev.h | 2
-rw-r--r--  include/linux/bpfptr.h | 2
-rw-r--r--  include/linux/bvec.h | 7
-rw-r--r--  include/linux/codetag.h | 5
-rw-r--r--  include/linux/damon.h | 18
-rw-r--r--  include/linux/freezer.h | 2
-rw-r--r--  include/linux/fs.h | 8
-rw-r--r--  include/linux/highmem-internal.h | 36
-rw-r--r--  include/linux/highmem.h | 8
-rw-r--r--  include/linux/huge_mm.h | 112
-rw-r--r--  include/linux/hugetlb.h | 7
-rw-r--r--  include/linux/kasan-enabled.h | 32
-rw-r--r--  include/linux/kasan.h | 6
-rw-r--r--  include/linux/khugepaged.h | 6
-rw-r--r--  include/linux/ksm.h | 12
-rw-r--r--  include/linux/maple_tree.h | 27
-rw-r--r--  include/linux/memcontrol.h | 10
-rw-r--r--  include/linux/mempool.h | 2
-rw-r--r--  include/linux/memremap.h | 45
-rw-r--r--  include/linux/migrate.h | 11
-rw-r--r--  include/linux/mm.h | 273
-rw-r--r--  include/linux/mm_inline.h | 37
-rw-r--r--  include/linux/mm_types.h | 136
-rw-r--r--  include/linux/mman.h | 2
-rw-r--r--  include/linux/mmap_lock.h | 85
-rw-r--r--  include/linux/mmzone.h | 91
-rw-r--r--  include/linux/netfs.h | 2
-rw-r--r--  include/linux/oom.h | 2
-rw-r--r--  include/linux/page-flags.h | 42
-rw-r--r--  include/linux/pageblock-flags.h | 12
-rw-r--r--  include/linux/pagemap.h | 65
-rw-r--r--  include/linux/pagevec.h | 4
-rw-r--r--  include/linux/pgalloc_tag.h | 7
-rw-r--r--  include/linux/pgtable.h | 26
-rw-r--r--  include/linux/rmap.h | 67
-rw-r--r--  include/linux/scatterlist.h | 3
-rw-r--r--  include/linux/sched/coredump.h | 18
-rw-r--r--  include/linux/sched/mm.h | 4
-rw-r--r--  include/linux/shmem_fs.h | 4
-rw-r--r--  include/linux/slab.h | 39
-rw-r--r--  include/linux/swap.h | 50
-rw-r--r--  include/linux/vm_event_item.h | 2
-rw-r--r--  include/linux/vmalloc.h | 12
-rw-r--r--  include/linux/writeback.h | 6
-rw-r--r--  include/linux/zpool.h | 86
46 files changed, 792 insertions, 653 deletions
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 9ef2633e2c08..d40ac39bfbe8 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -221,6 +221,16 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
ref->ct = NULL;
}
+static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag)
+{
+ tag->ct.flags |= CODETAG_FLAG_INACCURATE;
+}
+
+static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag)
+{
+ return !!(tag->ct.flags & CODETAG_FLAG_INACCURATE);
+}
+
#define alloc_tag_record(p) ((p) = current->alloc_tag)
#else /* CONFIG_MEM_ALLOC_PROFILING */
@@ -230,6 +240,8 @@ static inline bool mem_alloc_profiling_enabled(void) { return false; }
static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
size_t bytes) {}
static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) {}
+static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) { return false; }
#define alloc_tag_record(p) do {} while (0)
#endif /* CONFIG_MEM_ALLOC_PROFILING */
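The two new helpers are plain flag accessors on the tag's embedded codetag. A minimal illustrative sketch (hypothetical caller and function name, not part of this diff) of how a path that can no longer attribute frees precisely might mark and report a tag:

	#include <linux/alloc_tag.h>

	static void example_mark_counters_fuzzy(struct alloc_tag *tag)
	{
		/* From here on, bytes/calls accounted to this tag may drift. */
		alloc_tag_set_inaccurate(tag);

		if (alloc_tag_is_inaccurate(tag))
			pr_warn("%s:%u: allocation counters are now approximate\n",
				tag->ct.function, tag->ct.lineno);
	}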
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 066e5309bd45..dad5cb5b3812 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -199,7 +199,7 @@ struct gendisk {
unsigned int zone_wplugs_hash_bits;
atomic_t nr_zone_wplugs;
spinlock_t zone_wplugs_lock;
- struct mempool_s *zone_wplugs_pool;
+ struct mempool *zone_wplugs_pool;
struct hlist_head *zone_wplugs_hash;
struct workqueue_struct *zone_wplugs_wq;
#endif /* CONFIG_BLK_DEV_ZONED */
diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 1af241525a17..f6e0795db484 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -67,7 +67,7 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len)
{
- void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN);
+ void *p = kvmalloc_node_align_noprof(len, 1, GFP_USER | __GFP_NOWARN, NUMA_NO_NODE);
if (!p)
return ERR_PTR(-ENOMEM);
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 0a80e1f9aa20..3fc0efa0825b 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -22,11 +22,8 @@ struct page;
* @bv_len: Number of bytes in the address range.
* @bv_offset: Start of the address range relative to the start of @bv_page.
*
- * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len:
- *
- * nth_page(@bv_page, n) == @bv_page + n
- *
- * This holds because page_is_mergeable() checks the above property.
+ * All pages within a bio_vec starting from @bv_page are contiguous and
+ * can simply be iterated (see bvec_advance()).
*/
struct bio_vec {
struct page *bv_page;
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 457ed8fd3214..8ea2a5f7c98a 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -16,13 +16,16 @@ struct module;
#define CODETAG_SECTION_START_PREFIX "__start_"
#define CODETAG_SECTION_STOP_PREFIX "__stop_"
+/* codetag flags */
+#define CODETAG_FLAG_INACCURATE (1 << 0)
+
/*
* An instance of this structure is created in a special ELF section at every
* code location being tagged. At runtime, the special section is treated as
* an array of these.
*/
struct codetag {
- unsigned int flags; /* used in later patches */
+ unsigned int flags;
unsigned int lineno;
const char *modname;
const char *function;
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 9e62b2a85538..cae8c613c5fc 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -110,7 +110,7 @@ struct damon_target {
*
* @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED.
* @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD.
- * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT.
+ * @DAMOS_PAGEOUT: Reclaim the region.
* @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE.
* @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
* @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists.
@@ -121,10 +121,10 @@ struct damon_target {
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions
*
* The support of each action is up to running &struct damon_operations.
- * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except
- * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR
- * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum
- * DAMOS_LRU_DEPRIO, and &DAMOS_STAT.
+ * Refer to 'Operation Action' section of Documentation/mm/damon/design.rst for
+ * status of the supports.
+ *
+ * Note that DAMOS_PAGEOUT doesn't trigger demotions.
*/
enum damos_action {
DAMOS_WILLNEED,
@@ -748,7 +748,8 @@ struct damon_attrs {
* Accesses to other fields must be protected by themselves.
*
* @ops: Set of monitoring operations for given use cases.
- *
+ * @addr_unit: Scale factor for core to ops address conversion.
+ * @min_sz_region: Minimum region size.
* @adaptive_targets: Head of monitoring targets (&damon_target) list.
* @schemes: Head of schemes (&damos) list.
*/
@@ -790,6 +791,8 @@ struct damon_ctx {
struct mutex kdamond_lock;
struct damon_operations ops;
+ unsigned long addr_unit;
+ unsigned long min_sz_region;
struct list_head adaptive_targets;
struct list_head schemes;
@@ -878,7 +881,7 @@ static inline void damon_insert_region(struct damon_region *r,
void damon_add_region(struct damon_region *r, struct damon_target *t);
void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges);
+ unsigned int nr_ranges, unsigned long min_sz_region);
void damon_update_region_access_rate(struct damon_region *r, bool accessed,
struct damon_attrs *attrs);
@@ -935,6 +938,7 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs
}
+bool damon_initialized(void);
int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
bool damon_is_running(struct damon_ctx *ctx);
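damon_initialized() gives in-kernel DAMON API callers a single readiness check. A hedged sketch (hypothetical module init, not from this series) of the expected calling pattern:

	static int __init example_damon_client_init(void)
	{
		if (!damon_initialized())
			return -ENODEV;	/* DAMON core did not come up */

		/* ... build a damon_ctx, add targets/schemes, then damon_start() ... */
		return 0;
	}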
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index b303472255be..32884c9721e5 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -47,6 +47,7 @@ extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);
+extern void thaw_process(struct task_struct *p);
static inline bool try_to_freeze(void)
{
@@ -80,6 +81,7 @@ static inline int freeze_processes(void) { return -ENOSYS; }
static inline int freeze_kernel_threads(void) { return -ENOSYS; }
static inline void thaw_processes(void) {}
static inline void thaw_kernel_threads(void) {}
+static inline void thaw_process(struct task_struct *p) {}
static inline bool try_to_freeze(void) { return false; }
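thaw_process() lets the OOM-killer side wake a single frozen victim so it can run and release its memory, instead of thawing everything via thaw_processes(). Illustrative sketch only (example_wake_victim() is a hypothetical name):

	static void example_wake_victim(struct task_struct *victim)
	{
		if (frozen(victim))
			thaw_process(victim);
	}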
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e9d7c757efe..75fb216b0f7a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -537,7 +537,7 @@ struct address_space {
/*
* Returns true if any of the pages in the mapping are marked with the tag.
*/
-static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
+static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag)
{
return xa_marked(&mapping->i_pages, tag);
}
@@ -585,7 +585,7 @@ static inline void i_mmap_assert_write_locked(struct address_space *mapping)
/*
* Might pages of this file be mapped into userspace?
*/
-static inline int mapping_mapped(struct address_space *mapping)
+static inline int mapping_mapped(const struct address_space *mapping)
{
return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
}
@@ -599,7 +599,7 @@ static inline int mapping_mapped(struct address_space *mapping)
* If i_mmap_writable is negative, no new writable mappings are allowed. You
* can only deny writable mappings, if none exists right now.
*/
-static inline int mapping_writably_mapped(struct address_space *mapping)
+static inline int mapping_writably_mapped(const struct address_space *mapping)
{
return atomic_read(&mapping->i_mmap_writable) > 0;
}
@@ -2385,6 +2385,8 @@ static inline bool can_mmap_file(struct file *file)
return true;
}
+int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+ struct file *file, struct vm_area_struct *vma);
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma);
static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 36053c3d6d64..0574c21ca45d 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -7,7 +7,7 @@
*/
#ifdef CONFIG_KMAP_LOCAL
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
-void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
+void *__kmap_local_page_prot(const struct page *page, pgprot_t prot);
void kunmap_local_indexed(const void *vaddr);
void kmap_local_fork(struct task_struct *tsk);
void __kmap_local_sched_out(void);
@@ -33,7 +33,7 @@ static inline void kmap_flush_tlb(unsigned long addr) { }
#endif
void *kmap_high(struct page *page);
-void kunmap_high(struct page *page);
+void kunmap_high(const struct page *page);
void __kmap_flush_unused(void);
struct page *__kmap_to_page(void *addr);
@@ -50,7 +50,7 @@ static inline void *kmap(struct page *page)
return addr;
}
-static inline void kunmap(struct page *page)
+static inline void kunmap(const struct page *page)
{
might_sleep();
if (!PageHighMem(page))
@@ -68,12 +68,12 @@ static inline void kmap_flush_unused(void)
__kmap_flush_unused();
}
-static inline void *kmap_local_page(struct page *page)
+static inline void *kmap_local_page(const struct page *page)
{
return __kmap_local_page_prot(page, kmap_prot);
}
-static inline void *kmap_local_page_try_from_panic(struct page *page)
+static inline void *kmap_local_page_try_from_panic(const struct page *page)
{
if (!PageHighMem(page))
return page_address(page);
@@ -81,13 +81,13 @@ static inline void *kmap_local_page_try_from_panic(struct page *page)
return NULL;
}
-static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+static inline void *kmap_local_folio(const struct folio *folio, size_t offset)
{
- struct page *page = folio_page(folio, offset / PAGE_SIZE);
+ const struct page *page = folio_page(folio, offset / PAGE_SIZE);
return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE;
}
-static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot)
{
return __kmap_local_page_prot(page, prot);
}
@@ -102,7 +102,7 @@ static inline void __kunmap_local(const void *vaddr)
kunmap_local_indexed(vaddr);
}
-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
migrate_disable();
@@ -113,7 +113,7 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
return __kmap_local_page_prot(page, prot);
}
-static inline void *kmap_atomic(struct page *page)
+static inline void *kmap_atomic(const struct page *page)
{
return kmap_atomic_prot(page, kmap_prot);
}
@@ -173,32 +173,32 @@ static inline void *kmap(struct page *page)
return page_address(page);
}
-static inline void kunmap_high(struct page *page) { }
+static inline void kunmap_high(const struct page *page) { }
static inline void kmap_flush_unused(void) { }
-static inline void kunmap(struct page *page)
+static inline void kunmap(const struct page *page)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(page_address(page));
#endif
}
-static inline void *kmap_local_page(struct page *page)
+static inline void *kmap_local_page(const struct page *page)
{
return page_address(page);
}
-static inline void *kmap_local_page_try_from_panic(struct page *page)
+static inline void *kmap_local_page_try_from_panic(const struct page *page)
{
return page_address(page);
}
-static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+static inline void *kmap_local_folio(const struct folio *folio, size_t offset)
{
return folio_address(folio) + offset;
}
-static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot)
{
return kmap_local_page(page);
}
@@ -215,7 +215,7 @@ static inline void __kunmap_local(const void *addr)
#endif
}
-static inline void *kmap_atomic(struct page *page)
+static inline void *kmap_atomic(const struct page *page)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
migrate_disable();
@@ -225,7 +225,7 @@ static inline void *kmap_atomic(struct page *page)
return page_address(page);
}
-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot)
{
return kmap_atomic(page);
}
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 6234f316468c..105cc4c00cc3 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -43,7 +43,7 @@ static inline void *kmap(struct page *page);
* Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of
* pages in the low memory area.
*/
-static inline void kunmap(struct page *page);
+static inline void kunmap(const struct page *page);
/**
* kmap_to_page - Get the page for a kmap'ed address
@@ -93,7 +93,7 @@ static inline void kmap_flush_unused(void);
* disabling migration in order to keep the virtual address stable across
* preemption. No caller of kmap_local_page() can rely on this side effect.
*/
-static inline void *kmap_local_page(struct page *page);
+static inline void *kmap_local_page(const struct page *page);
/**
* kmap_local_folio - Map a page in this folio for temporary usage
@@ -129,7 +129,7 @@ static inline void *kmap_local_page(struct page *page);
* Context: Can be invoked from any context.
* Return: The virtual address of @offset.
*/
-static inline void *kmap_local_folio(struct folio *folio, size_t offset);
+static inline void *kmap_local_folio(const struct folio *folio, size_t offset);
/**
* kmap_atomic - Atomically map a page for temporary usage - Deprecated!
@@ -176,7 +176,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset);
* kunmap_atomic(vaddr2);
* kunmap_atomic(vaddr1);
*/
-static inline void *kmap_atomic(struct page *page);
+static inline void *kmap_atomic(const struct page *page);
/* Highmem related interfaces for management code */
static inline unsigned long nr_free_highpages(void);
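With the prototypes above now taking const struct page *, read-only users no longer need to cast away const. A minimal sketch of the short-lived mapping pattern this header documents (example_copy_from_page() is an assumed name; the existing memcpy_from_page() helper follows the same shape):

	#include <linux/highmem.h>
	#include <linux/string.h>

	/* Copy @len bytes starting at @offset out of a possibly-highmem page. */
	static void example_copy_from_page(const struct page *page, size_t offset,
					   void *dst, size_t len)
	{
		void *kaddr = kmap_local_page(page);

		memcpy(dst, kaddr + offset, len);	/* offset + len must stay within PAGE_SIZE */
		kunmap_local(kaddr);
	}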
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7748489fde1b..f327d62fc985 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -94,12 +94,15 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
#define THP_ORDERS_ALL \
(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT)
-#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
-#define TVA_IN_PF (1 << 1) /* Page fault handler */
-#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */
+enum tva_type {
+ TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */
+ TVA_PAGEFAULT, /* Serving a page fault. */
+ TVA_KHUGEPAGED, /* Khugepaged collapse. */
+ TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */
+};
-#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
- (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
+#define thp_vma_allowable_order(vma, vm_flags, type, order) \
+ (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order)))
#define split_folio(f) split_folio_to_list(f, NULL)
@@ -264,14 +267,14 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders);
/**
* thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
* @vma: the vm area to check
* @vm_flags: use these vm_flags instead of vma->vm_flags
- * @tva_flags: Which TVA flags to honour
+ * @type: TVA type
* @orders: bitfield of all orders to consider
*
* Calculates the intersection of the requested hugepage orders and the allowed
@@ -285,11 +288,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
- /* Optimization to check if required orders are enabled early. */
- if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
+ /*
+ * Optimization to check if required orders are enabled early. Only
+ * forced collapse ignores sysfs configs.
+ */
+ if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) {
unsigned long mask = READ_ONCE(huge_anon_orders_always);
if (vm_flags & VM_HUGEPAGE)
@@ -303,7 +309,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
return 0;
}
- return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
+ return __thp_vma_allowable_orders(vma, vm_flags, type, orders);
}
struct thpsize {
@@ -318,16 +324,32 @@ struct thpsize {
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
+/*
+ * Check whether THPs are explicitly disabled for this VMA, for example,
+ * through madvise or prctl.
+ */
static inline bool vma_thp_disabled(struct vm_area_struct *vma,
- vm_flags_t vm_flags)
-{
+ vm_flags_t vm_flags, bool forced_collapse)
+{
+ /* Are THPs disabled for this VMA? */
+ if (vm_flags & VM_NOHUGEPAGE)
+ return true;
+ /* Are THPs disabled for all VMAs in the whole process? */
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, vma->vm_mm))
+ return true;
/*
- * Explicitly disabled through madvise or prctl, or some
- * architectures may disable THP for some mappings, for
- * example, s390 kvm.
+ * Are THPs disabled only for VMAs where we didn't get an explicit
+ * advise to use them?
*/
- return (vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags);
+ if (vm_flags & VM_HUGEPAGE)
+ return false;
+ /*
+ * Forcing a collapse (e.g., madv_collapse), is a clear advice to
+ * use THPs.
+ */
+ if (forced_collapse)
+ return false;
+ return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm);
}
static inline bool thp_disabled_by_hw(void)
@@ -479,6 +501,8 @@ extern unsigned long huge_zero_pfn;
static inline bool is_huge_zero_folio(const struct folio *folio)
{
+ VM_WARN_ON_ONCE(!folio);
+
return READ_ONCE(huge_zero_folio) == folio;
}
@@ -495,6 +519,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
+static inline struct folio *get_persistent_huge_zero_folio(void)
+{
+ if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return NULL;
+
+ if (unlikely(!huge_zero_folio))
+ return NULL;
+
+ return huge_zero_folio;
+}
+
static inline bool thp_migration_supported(void)
{
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
@@ -526,7 +561,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
return 0;
@@ -553,22 +588,26 @@ static inline int
split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order)
{
- return 0;
+ VM_WARN_ON_ONCE_PAGE(1, page);
+ return -EINVAL;
}
static inline int split_huge_page(struct page *page)
{
- return 0;
+ VM_WARN_ON_ONCE_PAGE(1, page);
+ return -EINVAL;
}
static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
{
- return 0;
+ VM_WARN_ON_ONCE_FOLIO(1, folio);
+ return -EINVAL;
}
static inline int try_folio_split(struct folio *folio, struct page *page,
struct list_head *list)
{
- return 0;
+ VM_WARN_ON_ONCE_FOLIO(1, folio);
+ return -EINVAL;
}
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
@@ -685,6 +724,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb,
{
return 0;
}
+
+static inline struct folio *get_persistent_huge_zero_folio(void)
+{
+ return NULL;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline int split_folio_to_list_to_order(struct folio *folio,
@@ -698,4 +742,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
return split_folio_to_list_to_order(folio, NULL, new_order);
}
+/**
+ * largest_zero_folio - Get the largest zero size folio available
+ *
+ * This function shall be used when mm_get_huge_zero_folio() cannot be
+ * used as there is no appropriate mm lifetime to tie the huge zero folio
+ * from the caller.
+ *
+ * Deduce the size of the folio with folio_size instead of assuming the
+ * folio size.
+ *
+ * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO
+ * is enabled or a single page sized zero folio
+ */
+static inline struct folio *largest_zero_folio(void)
+{
+ struct folio *folio = get_persistent_huge_zero_folio();
+
+ if (folio)
+ return folio;
+
+ return page_folio(ZERO_PAGE(0));
+}
#endif /* _LINUX_HUGE_MM_H */
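A sketch of the calling convention the largest_zero_folio() kernel-doc asks for (illustrative, hypothetical helper name): derive the usable length from folio_size() rather than assuming PMD size, because the fallback is the single-page ZERO_PAGE(0).

	static size_t example_zero_chunk_size(void)
	{
		struct folio *zero = largest_zero_folio();

		/* PMD-sized with CONFIG_PERSISTENT_HUGE_ZERO_FOLIO, else PAGE_SIZE. */
		return folio_size(zero);
	}

A block-layer caller would then submit zeroes in chunks of that size, repeating until the requested range is covered.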
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 526d27e88b3b..8e63e46b8e1f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -788,9 +788,14 @@ static inline unsigned huge_page_shift(struct hstate *h)
return h->order + PAGE_SHIFT;
}
+static inline bool order_is_gigantic(unsigned int order)
+{
+ return order > MAX_PAGE_ORDER;
+}
+
static inline bool hstate_is_gigantic(struct hstate *h)
{
- return huge_page_order(h) > MAX_PAGE_ORDER;
+ return order_is_gigantic(huge_page_order(h));
}
static inline unsigned int pages_per_huge_page(const struct hstate *h)
diff --git a/include/linux/kasan-enabled.h b/include/linux/kasan-enabled.h
index 6f612d69ea0c..9eca967d8526 100644
--- a/include/linux/kasan-enabled.h
+++ b/include/linux/kasan-enabled.h
@@ -4,32 +4,46 @@
#include <linux/static_key.h>
-#ifdef CONFIG_KASAN_HW_TAGS
-
+#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS)
+/*
+ * Global runtime flag for KASAN modes that need runtime control.
+ * Used by ARCH_DEFER_KASAN architectures and HW_TAGS mode.
+ */
DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled);
+/*
+ * Runtime control for shadow memory initialization or HW_TAGS mode.
+ * Uses static key for architectures that need deferred KASAN or HW_TAGS.
+ */
static __always_inline bool kasan_enabled(void)
{
return static_branch_likely(&kasan_flag_enabled);
}
-static inline bool kasan_hw_tags_enabled(void)
+static inline void kasan_enable(void)
{
- return kasan_enabled();
+ static_branch_enable(&kasan_flag_enabled);
}
-
-#else /* CONFIG_KASAN_HW_TAGS */
-
-static inline bool kasan_enabled(void)
+#else
+/* For architectures that can enable KASAN early, use compile-time check. */
+static __always_inline bool kasan_enabled(void)
{
return IS_ENABLED(CONFIG_KASAN);
}
+static inline void kasan_enable(void) {}
+#endif /* CONFIG_ARCH_DEFER_KASAN || CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_HW_TAGS
+static inline bool kasan_hw_tags_enabled(void)
+{
+ return kasan_enabled();
+}
+#else
static inline bool kasan_hw_tags_enabled(void)
{
return false;
}
-
#endif /* CONFIG_KASAN_HW_TAGS */
#endif /* LINUX_KASAN_ENABLED_H */
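For an architecture selecting CONFIG_ARCH_DEFER_KASAN, the intended sequence is to set up shadow memory first and only then flip the static key. Illustrative sketch (hypothetical arch hook name):

	void __init example_arch_kasan_init(void)
	{
		/* Map and zero the KASAN shadow for kernel text/data... */

		/* ...then let kasan_enabled() start returning true. */
		kasan_enable();
	}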
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index b4973e7c2940..d12e1a5f5a9a 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -546,6 +546,12 @@ void kasan_report_async(void);
#endif /* CONFIG_KASAN_HW_TAGS */
+#ifdef CONFIG_KASAN_GENERIC
+void __init kasan_init_generic(void);
+#else
+static inline void kasan_init_generic(void) { }
+#endif
+
#ifdef CONFIG_KASAN_SW_TAGS
void __init kasan_init_sw_tags(void);
#else
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index ff6120463745..eb1946a70cff 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -2,6 +2,8 @@
#ifndef _LINUX_KHUGEPAGED_H
#define _LINUX_KHUGEPAGED_H
+#include <linux/mm.h>
+
extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;
@@ -20,13 +22,13 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
- if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
+ if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm))
__khugepaged_enter(mm);
}
static inline void khugepaged_exit(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
+ if (mm_flags_test(MMF_VM_HUGEPAGE, mm))
__khugepaged_exit(mm);
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index c17b955e7b0b..067538fc4d58 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -56,13 +56,19 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
/* Adding mm to ksm is best effort on fork. */
- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+ if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) {
+ long nr_ksm_zero_pages = atomic_long_read(&mm->ksm_zero_pages);
+
+ mm->ksm_merging_pages = 0;
+ mm->ksm_rmap_items = 0;
+ atomic_long_add(nr_ksm_zero_pages, &ksm_zero_pages);
__ksm_enter(mm);
+ }
}
static inline int ksm_execve(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return __ksm_enter(mm);
return 0;
@@ -70,7 +76,7 @@ static inline int ksm_execve(struct mm_struct *mm)
static inline void ksm_exit(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGEABLE, mm))
__ksm_exit(mm);
}
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 51a64ff23b88..66f98a3da8d8 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -57,17 +57,17 @@
* MT_FLAGS_ALLOC_RANGE flag.
*
* Node types:
- * 0x??1 = Root
- * 0x?00 = 16 bit nodes
- * 0x010 = 32 bit nodes
- * 0x110 = 64 bit nodes
+ * 0b??1 = Root
+ * 0b?00 = 16 bit nodes
+ * 0b010 = 32 bit nodes
+ * 0b110 = 64 bit nodes
*
* Slot size and location in the parent pointer:
* type : slot location
- * 0x??1 : Root
- * 0x?00 : 16 bit values, type in 0-1, slot in 2-6
- * 0x010 : 32 bit values, type in 0-2, slot in 3-6
- * 0x110 : 64 bit values, type in 0-2, slot in 3-6
+ * 0b??1 : Root
+ * 0b?00 : 16 bit values, type in 0-1, slot in 2-6
+ * 0b010 : 32 bit values, type in 0-2, slot in 3-6
+ * 0b110 : 64 bit values, type in 0-2, slot in 3-6
*/
/*
@@ -194,7 +194,6 @@ enum store_type {
#define MAPLE_RESERVED_RANGE 4096
#ifdef CONFIG_LOCKDEP
-typedef struct lockdep_map *lockdep_map_p;
#define mt_lock_is_held(mt) \
(!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock))
@@ -207,7 +206,6 @@ typedef struct lockdep_map *lockdep_map_p;
#define mt_on_stack(mt) (mt).ma_external_lock = NULL
#else
-typedef struct { /* nothing */ } lockdep_map_p;
#define mt_lock_is_held(mt) 1
#define mt_write_lock_is_held(mt) 1
#define mt_set_external_lock(mt, lock) do { } while (0)
@@ -230,8 +228,10 @@ typedef struct { /* nothing */ } lockdep_map_p;
*/
struct maple_tree {
union {
- spinlock_t ma_lock;
- lockdep_map_p ma_external_lock;
+ spinlock_t ma_lock;
+#ifdef CONFIG_LOCKDEP
+ struct lockdep_map *ma_external_lock;
+#endif
};
unsigned int ma_flags;
void __rcu *ma_root;
@@ -483,6 +483,9 @@ struct ma_wr_state {
#define MA_ERROR(err) \
((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
+/*
+ * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs
+ */
#define MA_STATE(name, mt, first, end) \
struct ma_state name = { \
.tree = mt, \
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9924f157aae0..16fe0306e50e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -908,7 +908,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
}
-void mem_cgroup_handle_over_high(gfp_t gfp_mask);
+void __mem_cgroup_handle_over_high(gfp_t gfp_mask);
+
+static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+{
+ if (unlikely(current->memcg_nr_pages_over_high))
+ __mem_cgroup_handle_over_high(gfp_mask);
+}
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
@@ -1061,6 +1067,8 @@ extern int mem_cgroup_init(void);
#define MEM_CGROUP_ID_SHIFT 0
+#define root_mem_cgroup (NULL)
+
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
return NULL;
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 7b151441341b..34941a4b9026 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -15,7 +15,7 @@ struct kmem_cache;
typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
typedef void (mempool_free_t)(void *element, void *pool_data);
-typedef struct mempool_s {
+typedef struct mempool {
spinlock_t lock;
int min_nr; /* nr of elements at *elements */
int curr_nr; /* Current nr of elements at *elements */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 4aa151914eab..e5951ba12a28 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -157,45 +157,52 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
return 1 << pgmap->vmemmap_shift;
}
-static inline bool is_device_private_page(const struct page *page)
+static inline bool folio_is_device_private(const struct folio *folio)
{
return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
- is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE;
+ folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
-static inline bool folio_is_device_private(const struct folio *folio)
+static inline bool is_device_private_page(const struct page *page)
{
- return is_device_private_page(&folio->page);
+ return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+ folio_is_device_private(page_folio(page));
}
-static inline bool is_pci_p2pdma_page(const struct page *page)
+static inline bool folio_is_pci_p2pdma(const struct folio *folio)
{
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
- is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA;
+ folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}
-static inline bool is_device_coherent_page(const struct page *page)
+static inline bool is_pci_p2pdma_page(const struct page *page)
{
- return is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_COHERENT;
+ return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+ folio_is_pci_p2pdma(page_folio(page));
}
static inline bool folio_is_device_coherent(const struct folio *folio)
{
- return is_device_coherent_page(&folio->page);
+ return folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_COHERENT;
}
-static inline bool is_fsdax_page(const struct page *page)
+static inline bool is_device_coherent_page(const struct page *page)
{
- return is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX;
+ return folio_is_device_coherent(page_folio(page));
}
static inline bool folio_is_fsdax(const struct folio *folio)
{
- return is_fsdax_page(&folio->page);
+ return folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
+static inline bool is_fsdax_page(const struct page *page)
+{
+ return folio_is_fsdax(page_folio(page));
}
#ifdef CONFIG_ZONE_DEVICE
@@ -204,8 +211,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap);
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
unsigned long memremap_compat_align(void);
@@ -227,8 +233,7 @@ static inline void devm_memunmap_pages(struct device *dev,
{
}
-static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap)
+static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn)
{
return NULL;
}
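With the pgmap argument gone, callers of get_dev_pagemap() simply look the pagemap up by pfn and drop the reference when done. A hedged sketch (hypothetical helper name):

	static bool example_pfn_is_device_private(unsigned long pfn)
	{
		struct dev_pagemap *pgmap = get_dev_pagemap(pfn);
		bool ret = pgmap && pgmap->type == MEMORY_DEVICE_PRIVATE;

		if (pgmap)
			put_dev_pagemap(pgmap);
		return ret;
	}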
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9009e27b5f44..1f0ac122c3bf 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -12,14 +12,6 @@ typedef void free_folio_t(struct folio *folio, unsigned long private);
struct migration_target_control;
-/*
- * Return values from addresss_space_operations.migratepage():
- * - negative errno on page migration failure;
- * - zero on page migration success;
- */
-#define MIGRATEPAGE_SUCCESS 0
-#define MIGRATEPAGE_UNMAP 1
-
/**
* struct movable_operations - Driver page migration
* @isolate_page:
@@ -35,8 +27,7 @@ struct migration_target_control;
* @src page. The driver should copy the contents of the
* @src page to the @dst page and set up the fields of @dst page.
* Both pages are locked.
- * If page migration is successful, the driver should
- * return MIGRATEPAGE_SUCCESS.
+ * If page migration is successful, the driver should return 0.
* If the driver cannot migrate the page at the moment, it can return
* -EAGAIN. The VM interprets this as a temporary migration failure and
* will retry it later. Any other error value is a permanent migration
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ae97a0b8ec7..06978b4dbeb8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -34,6 +34,8 @@
#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/rcuwait.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
struct mempolicy;
struct anon_vma;
@@ -69,6 +71,15 @@ static inline void totalram_pages_add(long count)
extern void * high_memory;
+/*
+ * Convert between pages and MB
+ * 20 is the shift for 1MB (2^20 = 1MB)
+ * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages)
+ * So (20 - PAGE_SHIFT) converts between pages and MB
+ */
+#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT))
+#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
@@ -198,11 +209,13 @@ extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
-#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
-#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio))
+bool page_range_contiguous(const struct page *page, unsigned long nr_pages);
#else
-#define nth_page(page,n) ((page) + (n))
-#define folio_page_idx(folio, p) ((p) - &(folio)->page)
+static inline bool page_range_contiguous(const struct page *page,
+ unsigned long nr_pages)
+{
+ return true;
+}
#endif
/* to align the pointer to the (next) page boundary */
@@ -214,6 +227,20 @@ extern unsigned long sysctl_admin_reserve_kbytes;
/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
+/**
+ * folio_page_idx - Return the number of a page in a folio.
+ * @folio: The folio.
+ * @page: The folio page.
+ *
+ * This function expects that the page is actually part of the folio.
+ * The returned number is relative to the start of the folio.
+ */
+static inline unsigned long folio_page_idx(const struct folio *folio,
+ const struct page *page)
+{
+ return page - &folio->page;
+}
+
static inline struct folio *lru_to_folio(struct list_head *head)
{
return list_entry((head)->prev, struct folio, lru);
@@ -648,13 +675,21 @@ struct vm_operations_struct {
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
unsigned long addr, pgoff_t *ilx);
#endif
+#ifdef CONFIG_FIND_NORMAL_PAGE
/*
- * Called by vm_normal_page() for special PTEs to find the
- * page for @addr. This is useful if the default behavior
- * (using pte_page()) would not find the correct page.
+ * Called by vm_normal_page() for special PTEs in @vma at @addr. This
+ * allows for returning a "normal" page from vm_normal_page() even
+ * though the PTE indicates that the "struct page" either does not exist
+ * or should not be touched: "special".
+ *
+ * Do not add new users: this really only works when a "normal" page
+ * was mapped, but then the PTE got changed to something weird (+
+ * marked special) that would not make pte_pfn() identify the originally
+ * inserted page.
*/
- struct page *(*find_special_page)(struct vm_area_struct *vma,
- unsigned long addr);
+ struct page *(*find_normal_page)(struct vm_area_struct *vma,
+ unsigned long addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
};
#ifdef CONFIG_NUMA_BALANCING
@@ -684,7 +719,7 @@ static inline void release_fault_lock(struct vm_fault *vmf)
mmap_read_unlock(vmf->vma->vm_mm);
}
-static inline void assert_fault_locked(struct vm_fault *vmf)
+static inline void assert_fault_locked(const struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_VMA_LOCK)
vma_assert_locked(vmf->vma);
@@ -697,12 +732,42 @@ static inline void release_fault_lock(struct vm_fault *vmf)
mmap_read_unlock(vmf->vma->vm_mm);
}
-static inline void assert_fault_locked(struct vm_fault *vmf)
+static inline void assert_fault_locked(const struct vm_fault *vmf)
{
mmap_assert_locked(vmf->vma->vm_mm);
}
#endif /* CONFIG_PER_VMA_LOCK */
+static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
+{
+ return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
+{
+ return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm)
+{
+ return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_set(int flag, struct mm_struct *mm)
+{
+ set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_clear(int flag, struct mm_struct *mm)
+{
+ clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_clear_all(struct mm_struct *mm)
+{
+ bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS);
+}
+
extern const struct vm_operations_struct vma_dummy_vm_ops;
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
@@ -810,7 +875,7 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
vma->vm_end >= vma->vm_mm->start_stack;
}
-static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
+static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -824,7 +889,7 @@ static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
return false;
}
-static inline bool vma_is_foreign(struct vm_area_struct *vma)
+static inline bool vma_is_foreign(const struct vm_area_struct *vma)
{
if (!current->mm)
return true;
@@ -835,7 +900,7 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma)
return false;
}
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
+static inline bool vma_is_accessible(const struct vm_area_struct *vma)
{
return vma->vm_flags & VM_ACCESS_FLAGS;
}
@@ -846,7 +911,7 @@ static inline bool is_shared_maywrite(vm_flags_t vm_flags)
(VM_SHARED | VM_MAYWRITE);
}
-static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma)
{
return is_shared_maywrite(vma->vm_flags);
}
@@ -930,14 +995,14 @@ static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
* The vma_is_shmem is not inline because it is used only by slow
* paths in userfault.
*/
-bool vma_is_shmem(struct vm_area_struct *vma);
-bool vma_is_anon_shmem(struct vm_area_struct *vma);
+bool vma_is_shmem(const struct vm_area_struct *vma);
+bool vma_is_anon_shmem(const struct vm_area_struct *vma);
#else
-static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
-static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
+static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; }
+static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; }
#endif
-int vma_is_stack_for_current(struct vm_area_struct *vma);
+int vma_is_stack_for_current(const struct vm_area_struct *vma);
/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
@@ -953,12 +1018,12 @@ static inline unsigned int folio_large_order(const struct folio *folio)
}
#ifdef NR_PAGES_IN_LARGE_FOLIO
-static inline long folio_large_nr_pages(const struct folio *folio)
+static inline unsigned long folio_large_nr_pages(const struct folio *folio)
{
return folio->_nr_pages;
}
#else
-static inline long folio_large_nr_pages(const struct folio *folio)
+static inline unsigned long folio_large_nr_pages(const struct folio *folio)
{
return 1L << folio_large_order(folio);
}
@@ -971,11 +1036,11 @@ static inline long folio_large_nr_pages(const struct folio *folio)
* set before the order is initialised, or this may be a tail page.
* See compaction.c for some good examples.
*/
-static inline unsigned int compound_order(struct page *page)
+static inline unsigned int compound_order(const struct page *page)
{
- struct folio *folio = (struct folio *)page;
+ const struct folio *folio = (struct folio *)page;
- if (!test_bit(PG_head, &folio->flags))
+ if (!test_bit(PG_head, &folio->flags.f))
return 0;
return folio_large_order(folio);
}
@@ -1191,7 +1256,7 @@ int folio_mc_copy(struct folio *dst, struct folio *src);
unsigned long nr_free_buffer_pages(void);
/* Returns the number of bytes in this potentially compound page. */
-static inline unsigned long page_size(struct page *page)
+static inline unsigned long page_size(const struct page *page)
{
return PAGE_SIZE << compound_order(page);
}
@@ -1505,21 +1570,26 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags)
*/
static inline int page_zone_id(struct page *page)
{
- return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
+ return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK;
}
#ifdef NODE_NOT_IN_PAGE_FLAGS
-int page_to_nid(const struct page *page);
+int memdesc_nid(memdesc_flags_t mdf);
#else
-static inline int page_to_nid(const struct page *page)
+static inline int memdesc_nid(memdesc_flags_t mdf)
{
- return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK;
+ return (mdf.f >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
+static inline int page_to_nid(const struct page *page)
+{
+ return memdesc_nid(PF_POISONED_CHECK(page)->flags);
+}
+
static inline int folio_nid(const struct folio *folio)
{
- return page_to_nid(&folio->page);
+ return memdesc_nid(folio->flags);
}
#ifdef CONFIG_NUMA_BALANCING
@@ -1588,14 +1658,14 @@ static inline void page_cpupid_reset_last(struct page *page)
#else
static inline int folio_last_cpupid(struct folio *folio)
{
- return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+ return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}
int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
- page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
+ page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
@@ -1691,7 +1761,7 @@ static inline u8 page_kasan_tag(const struct page *page)
u8 tag = KASAN_TAG_KERNEL;
if (kasan_enabled()) {
- tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
+ tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
tag ^= 0xff;
}
@@ -1706,12 +1776,12 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag)
return;
tag ^= 0xff;
- old_flags = READ_ONCE(page->flags);
+ old_flags = READ_ONCE(page->flags.f);
do {
flags = old_flags;
flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
}
static inline void page_kasan_tag_reset(struct page *page)
@@ -1742,26 +1812,26 @@ static inline pg_data_t *page_pgdat(const struct page *page)
return NODE_DATA(page_to_nid(page));
}
-static inline struct zone *folio_zone(const struct folio *folio)
+static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
- return page_zone(&folio->page);
+ return NODE_DATA(folio_nid(folio));
}
-static inline pg_data_t *folio_pgdat(const struct folio *folio)
+static inline struct zone *folio_zone(const struct folio *folio)
{
- return page_pgdat(&folio->page);
+ return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)];
}
#ifdef SECTION_IN_PAGE_FLAGS
static inline void set_page_section(struct page *page, unsigned long section)
{
- page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
- page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
+ page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
+ page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}
-static inline unsigned long page_to_section(const struct page *page)
+static inline unsigned long memdesc_section(memdesc_flags_t mdf)
{
- return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
+ return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
#endif
@@ -1785,7 +1855,7 @@ static inline struct folio *pfn_folio(unsigned long pfn)
}
#ifdef CONFIG_MMU
-static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
+static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot)
{
return pfn_pte(page_to_pfn(page), pgprot);
}
@@ -1800,7 +1870,7 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
*
* Return: A page table entry suitable for mapping this folio.
*/
-static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
+static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot)
{
return pfn_pte(folio_pfn(folio), pgprot);
}
@@ -1816,7 +1886,7 @@ static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
*
* Return: A page table entry suitable for mapping this folio.
*/
-static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
+static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot)
{
return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
}
@@ -1832,7 +1902,7 @@ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
*
* Return: A page table entry suitable for mapping this folio.
*/
-static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot)
+static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot)
{
return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot));
}
@@ -1900,7 +1970,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
{
VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
- if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
+ if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))
return false;
return folio_maybe_dma_pinned(folio);
@@ -1966,14 +2036,14 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio)
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
- page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
- page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
+ page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT);
+ page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
static inline void set_page_node(struct page *page, unsigned long node)
{
- page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
- page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
+ page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT);
+ page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT;
}
static inline void set_page_links(struct page *page, enum zone_type zone,
@@ -1992,30 +2062,46 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
*
* Return: A positive power of two.
*/
-static inline long folio_nr_pages(const struct folio *folio)
+static inline unsigned long folio_nr_pages(const struct folio *folio)
{
if (!folio_test_large(folio))
return 1;
return folio_large_nr_pages(folio);
}
-/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER)
+#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE)
+/*
+ * We don't expect any folios that exceed buddy sizes (and consequently
+ * memory sections).
+ */
+#define MAX_FOLIO_ORDER MAX_PAGE_ORDER
+#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+/*
+ * Only pages within a single memory section are guaranteed to be
+ * contiguous. By limiting folios to a single memory section, all folio
+ * pages are guaranteed to be contiguous.
+ */
+#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT
#else
-#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES
+/*
+ * There is no real limit on the folio size. We limit them to the maximum we
+ * currently expect (e.g., hugetlb, dax).
+ */
+#define MAX_FOLIO_ORDER PUD_ORDER
#endif
+#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER)
+
/*
* compound_nr() returns the number of pages in this potentially compound
* page. compound_nr() can be called on a tail page, and is defined to
* return 1 in that case.
*/
-static inline long compound_nr(struct page *page)
+static inline unsigned long compound_nr(const struct page *page)
{
- struct folio *folio = (struct folio *)page;
+ const struct folio *folio = (struct folio *)page;
- if (!test_bit(PG_head, &folio->flags))
+ if (!test_bit(PG_head, &folio->flags.f))
return 1;
return folio_large_nr_pages(folio);
}
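
To make the three-way MAX_FOLIO_ORDER choice above concrete, here is a worked example with typical x86-64 defaults (4 KiB pages); the exact numbers depend on configuration and are illustrative, not guarantees:

        /*
         * x86-64, 4 KiB pages (usual defaults):
         *
         *   no gigantic pages:       MAX_FOLIO_ORDER = MAX_PAGE_ORDER = 10
         *                            -> at most 2^10 pages = 4 MiB per folio
         *   SPARSEMEM, no VMEMMAP:   MAX_FOLIO_ORDER = PFN_SECTION_SHIFT = 15
         *                            -> a folio fits one 128 MiB section
         *   otherwise:               MAX_FOLIO_ORDER = PUD_ORDER = 18
         *                            -> up to a 1 GiB PUD-sized folio
         */
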
@@ -2351,6 +2437,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
+struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t pud);
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
@@ -2529,7 +2617,7 @@ void folio_add_pin(struct folio *folio);
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
- struct task_struct *task, bool bypass_rlim);
+ const struct task_struct *task, bool bypass_rlim);
struct kvec;
struct page *get_dump_page(unsigned long addr, int *locked);
@@ -2654,7 +2742,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm)
unsigned long _rss = get_mm_rss(mm);
if (data_race(mm->hiwater_rss) < _rss)
- (mm)->hiwater_rss = _rss;
+ data_race(mm->hiwater_rss = _rss);
}
static inline void update_hiwater_vm(struct mm_struct *mm)
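
The update_hiwater_rss() change above completes a KCSAN annotation: the racy read of hiwater_rss was already wrapped in data_race(), and the racy store now carries the same marking instead of the old bare assignment. The idiom as a standalone sketch, assuming only data_race() from linux/compiler.h:

        unsigned long rss = get_mm_rss(mm);

        /* Both sides of the benign race are marked for KCSAN: */
        if (data_race(mm->hiwater_rss) < rss)           /* tolerated racy read */
                data_race(mm->hiwater_rss = rss);       /* tolerated racy write */
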
@@ -2846,16 +2934,22 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
}
#endif /* CONFIG_MMU */
+enum pt_flags {
+ PT_reserved = PG_reserved,
+ /* High bits are used for zone/node/section */
+};
+
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
return page_ptdesc(virt_to_page(x));
}
-static inline void *ptdesc_to_virt(const struct ptdesc *pt)
-{
- return page_to_virt(ptdesc_page(pt));
-}
-
+/**
+ * ptdesc_address - Virtual address of page table.
+ * @pt: Page table descriptor.
+ *
+ * Return: The first byte of the page table described by @pt.
+ */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
return folio_address(ptdesc_folio(pt));
@@ -2863,7 +2957,7 @@ static inline void *ptdesc_address(const struct ptdesc *pt)
static inline bool pagetable_is_reserved(struct ptdesc *pt)
{
- return folio_test_reserved(ptdesc_folio(pt));
+ return test_bit(PT_reserved, &pt->pt_flags.f);
}
/**
@@ -2973,21 +3067,26 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
+static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc)
+{
+ return compound_nr(ptdesc_page(ptdesc));
+}
+
static inline void __pagetable_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
+ pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags));
- __folio_set_pgtable(folio);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ __SetPageTable(ptdesc_page(ptdesc));
+ mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc));
}
static inline void pagetable_dtor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
+ pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags));
ptlock_free(ptdesc);
- __folio_clear_pgtable(folio);
- lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+ __ClearPageTable(ptdesc_page(ptdesc));
+ mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc));
}
static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
@@ -3292,7 +3391,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
/* mmap.c */
-extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
+extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -3432,7 +3531,7 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
return mtree_load(&mm->mm_mt, addr);
}
-static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
+static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma)
{
if (vma->vm_flags & VM_GROWSDOWN)
return stack_guard_gap;
@@ -3444,7 +3543,7 @@ static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
return 0;
}
-static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
+static inline unsigned long vm_start_gap(const struct vm_area_struct *vma)
{
unsigned long gap = stack_guard_start_gap(vma);
unsigned long vm_start = vma->vm_start;
@@ -3455,7 +3554,7 @@ static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
return vm_start;
}
-static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
+static inline unsigned long vm_end_gap(const struct vm_area_struct *vma)
{
unsigned long vm_end = vma->vm_end;
@@ -3467,7 +3566,7 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
return vm_end;
}
-static inline unsigned long vma_pages(struct vm_area_struct *vma)
+static inline unsigned long vma_pages(const struct vm_area_struct *vma)
{
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
@@ -3484,7 +3583,7 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
return vma;
}
-static inline bool range_in_vma(struct vm_area_struct *vma,
+static inline bool range_in_vma(const struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
return (vma && vma->vm_start <= start && end <= vma->vm_end);
@@ -3600,7 +3699,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
* Indicates whether GUP can follow a PROT_NONE mapped page, or whether
* a (NUMA hinting) fault is required.
*/
-static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
+static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma,
unsigned int flags)
{
/*
@@ -3730,7 +3829,7 @@ static inline bool debug_guardpage_enabled(void)
return static_branch_unlikely(&_debug_guardpage_enabled);
}
-static inline bool page_is_guard(struct page *page)
+static inline bool page_is_guard(const struct page *page)
{
if (!debug_guardpage_enabled())
return false;
@@ -3761,7 +3860,7 @@ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
-static inline bool page_is_guard(struct page *page) { return false; }
+static inline bool page_is_guard(const struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -3784,7 +3883,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
}
#endif /* __HAVE_ARCH_GATE_AREA */
-extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
+bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm);
void drop_slab(void);
@@ -3843,7 +3942,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
/* number of pfns from base where pfn_to_page() is valid */
if (altmap)
@@ -3857,7 +3956,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
altmap->alloc -= nr_pfns;
}
#else
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
return 0;
}
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 89b518ff097e..d6c1011b38f2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -25,7 +25,7 @@
* 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
* ram or swap backed folio.
*/
-static inline int folio_is_file_lru(struct folio *folio)
+static inline int folio_is_file_lru(const struct folio *folio)
{
return !folio_test_swapbacked(folio);
}
@@ -84,7 +84,7 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio)
* Return: The LRU list a folio should be on, as an index
* into the array of LRU lists.
*/
-static __always_inline enum lru_list folio_lru_list(struct folio *folio)
+static __always_inline enum lru_list folio_lru_list(const struct folio *folio)
{
enum lru_list lru;
@@ -141,9 +141,9 @@ static inline int lru_tier_from_refs(int refs, bool workingset)
return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
}
-static inline int folio_lru_refs(struct folio *folio)
+static inline int folio_lru_refs(const struct folio *folio)
{
- unsigned long flags = READ_ONCE(folio->flags);
+ unsigned long flags = READ_ONCE(folio->flags.f);
if (!(flags & BIT(PG_referenced)))
return 0;
@@ -154,14 +154,14 @@ static inline int folio_lru_refs(struct folio *folio)
return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1;
}
-static inline int folio_lru_gen(struct folio *folio)
+static inline int folio_lru_gen(const struct folio *folio)
{
- unsigned long flags = READ_ONCE(folio->flags);
+ unsigned long flags = READ_ONCE(folio->flags.f);
return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
-static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen)
{
unsigned long max_seq = lruvec->lrugen.max_seq;
@@ -217,12 +217,13 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
-static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio,
+static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec,
+ const struct folio *folio,
bool reclaiming)
{
int gen;
int type = folio_is_file_lru(folio);
- struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ const struct lru_gen_folio *lrugen = &lruvec->lrugen;
/*
* +-----------------------------------+-----------------------------------+
@@ -268,7 +269,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
gen = lru_gen_from_seq(seq);
flags = (gen + 1UL) << LRU_GEN_PGOFF;
/* see the comment on MIN_NR_GENS about PG_active */
- set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+ set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags);
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
@@ -293,7 +294,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
/* for folio_migrate_flags() */
flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
- flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags);
gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
lru_gen_update_size(lruvec, folio, gen, -1);
@@ -302,11 +303,11 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
return true;
}
-static inline void folio_migrate_refs(struct folio *new, struct folio *old)
+static inline void folio_migrate_refs(struct folio *new, const struct folio *old)
{
- unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK;
+ unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK;
- set_mask_bits(&new->flags, LRU_REFS_MASK, refs);
+ set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs);
}
#else /* !CONFIG_LRU_GEN */
@@ -330,7 +331,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
return false;
}
-static inline void folio_migrate_refs(struct folio *new, struct folio *old)
+static inline void folio_migrate_refs(struct folio *new, const struct folio *old)
{
}
@@ -508,7 +509,7 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm)
atomic_dec(&mm->tlb_flush_pending);
}
-static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+static inline bool mm_tlb_flush_pending(const struct mm_struct *mm)
{
/*
* Must be called after having acquired the PTL; orders against that
@@ -521,7 +522,7 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
return atomic_read(&mm->tlb_flush_pending);
}
-static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
+static inline bool mm_tlb_flush_nested(const struct mm_struct *mm)
{
/*
* Similar to mm_tlb_flush_pending(), we must have acquired the PTL
@@ -605,7 +606,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static inline bool vma_has_recency(struct vm_area_struct *vma)
+static inline bool vma_has_recency(const struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
return false;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f625c35128b..90e5790c318f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -20,6 +20,7 @@
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
#include <linux/types.h>
+#include <linux/bitmap.h>
#include <asm/mmu.h>
@@ -33,6 +34,10 @@ struct address_space;
struct futex_private_hash;
struct mem_cgroup;
+typedef struct {
+ unsigned long f;
+} memdesc_flags_t;
+
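
Wrapping the flags word in a one-member struct is a deliberate type-safety move: a memdesc_flags_t no longer converts silently to or from unsigned long, which is what forces the explicit .f accesses visible throughout this series. A sketch of what the wrapper does and does not allow:

        memdesc_flags_t mdf = page->flags;

        /* unsigned long bad = mdf;     would not compile: no implicit conversion */
        unsigned long good = mdf.f;     /* access must name the .f member */
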
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
@@ -71,7 +76,7 @@ struct mem_cgroup;
#endif
struct page {
- unsigned long flags; /* Atomic flags, some possibly
+ memdesc_flags_t flags; /* Atomic flags, some possibly
* updated asynchronously */
/*
* Five words (20/40 bytes) are available in this union.
@@ -89,21 +94,10 @@ struct page {
union {
struct list_head lru;
- /* Or, for the Unevictable "LRU list" slot */
- struct {
- /* Always even, to negate PageTail */
- void *__filler;
- /* Count page's or folio's mlocks */
- unsigned int mlock_count;
- };
-
/* Or, free page */
struct list_head buddy_list;
struct list_head pcp_list;
- struct {
- struct llist_node pcp_llist;
- unsigned int order;
- };
+ struct llist_node pcp_llist;
};
struct address_space *mapping;
union {
@@ -114,7 +108,8 @@ struct page {
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
* Used for swp_entry_t if swapcache flag set.
- * Indicates order in the buddy system if PageBuddy.
+ * Indicates order in the buddy system if PageBuddy
+ * or on pcp_llist.
*/
unsigned long private;
};
@@ -382,11 +377,13 @@ struct folio {
union {
struct {
/* public: */
- unsigned long flags;
+ memdesc_flags_t flags;
union {
struct list_head lru;
/* private: avoid cluttering the output */
+ /* For the Unevictable "LRU list" slot */
struct {
+ /* Avoid compound_head */
void *__filler;
/* public: */
unsigned int mlock_count;
@@ -525,7 +522,7 @@ FOLIO_MATCH(compound_head, _head_3);
/**
* struct ptdesc - Memory descriptor for page tables.
- * @__page_flags: Same as page flags. Powerpc only.
+ * @pt_flags: enum pt_flags plus zone/node/section.
* @pt_rcu_head: For freeing page table pages.
* @pt_list: List of used page tables. Used for s390 gmap shadow pages
* (which are not linked into the user page tables) and x86
@@ -547,7 +544,7 @@ FOLIO_MATCH(compound_head, _head_3);
* understanding of the issues.
*/
struct ptdesc {
- unsigned long __page_flags;
+ memdesc_flags_t pt_flags;
union {
struct rcu_head pt_rcu_head;
@@ -585,7 +582,7 @@ struct ptdesc {
#define TABLE_MATCH(pg, pt) \
static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
-TABLE_MATCH(flags, __page_flags);
+TABLE_MATCH(flags, pt_flags);
TABLE_MATCH(compound_head, pt_list);
TABLE_MATCH(compound_head, _pt_pad_1);
TABLE_MATCH(mapping, __page_mapping);
@@ -627,7 +624,7 @@ static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
atomic_dec(&ptdesc->pt_share_count);
}
-static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
+static inline int ptdesc_pmd_pts_count(const struct ptdesc *ptdesc)
{
return atomic_read(&ptdesc->pt_share_count);
}
@@ -660,7 +657,7 @@ static inline void set_page_private(struct page *page, unsigned long private)
page->private = private;
}
-static inline void *folio_get_private(struct folio *folio)
+static inline void *folio_get_private(const struct folio *folio)
{
return folio->private;
}
@@ -785,13 +782,14 @@ struct pfnmap_track_ctx {
*/
struct vm_area_desc {
/* Immutable state. */
- struct mm_struct *mm;
+ const struct mm_struct *const mm;
+ struct file *const file; /* May vary from vm_file in stacked callers. */
unsigned long start;
unsigned long end;
/* Mutable fields. Populated with initial state. */
pgoff_t pgoff;
- struct file *file;
+ struct file *vm_file;
vm_flags_t vm_flags;
pgprot_t page_prot;
@@ -932,6 +930,15 @@ struct mm_cid {
};
#endif
+/*
+ * Opaque type representing current mm_struct flag state. Must be accessed via
+ * mm_flags_xxx() helper functions.
+ */
+#define NUM_MM_FLAG_BITS (64)
+typedef struct {
+ DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
+} __private mm_flags_t;
+
struct kioctx_table;
struct iommu_mm_data;
struct mm_struct {
@@ -1031,10 +1038,10 @@ struct mm_struct {
* counters
*/
/*
- * With some kernel config, the current mmap_lock's offset
- * inside 'mm_struct' is at 0x120, which is very optimal, as
+ * Typically the current mmap_lock's offset is 56 bytes from
+ * the last cacheline boundary, which is very optimal, as
* its two hot fields 'count' and 'owner' sit in 2 different
- * cachelines, and when mmap_lock is highly contended, both
+ * cachelines, and when mmap_lock is highly contended, both
* of the 2 fields will be accessed frequently, current layout
* will help to reduce cache bouncing.
*
@@ -1119,7 +1126,7 @@ struct mm_struct {
/* Architecture-specific MM context */
mm_context_t context;
- unsigned long flags; /* Must use atomic bitops to access */
+ mm_flags_t flags; /* Must use mm_flags_* helpers to access */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
@@ -1229,6 +1236,40 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
+/* Set the first system word of mm flags, non-atomically. */
+static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags);
+
+ bitmap_copy(bitmap, &value, BITS_PER_LONG);
+}
+
+/* Obtain a read-only view of the bitmap. */
+static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm)
+{
+ return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags);
+}
+
+/* Read the first system word of mm flags, non-atomically. */
+static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm)
+{
+ const unsigned long *bitmap = __mm_flags_get_bitmap(mm);
+
+ return bitmap_read(bitmap, 0, BITS_PER_LONG);
+}
+
+/*
+ * Update the first system word of mm flags ONLY, applying the specified mask to
+ * it, then setting all flags specified by bits.
+ */
+static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm,
+ unsigned long mask, unsigned long bits)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags);
+
+ set_mask_bits(bitmap, mask, bits);
+}
+
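
These word-level helpers are the escape hatch for legacy code; ordinary callers in this series use bit-level accessors such as mm_flags_test() (see the map_deny_write_exec() and check_stable_address_space() conversions below). Those accessors are not part of this hunk; a plausible shape, assuming they simply wrap the standard atomic bitops over the private bitmap:

        static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
        {
                return test_bit(flag, __mm_flags_get_bitmap(mm));
        }

        static inline void mm_flags_set(int flag, struct mm_struct *mm)
        {
                set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
        }
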
#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
MT_FLAGS_USE_RCU)
extern struct mm_struct init_mm;
@@ -1729,7 +1770,7 @@ enum {
* the modes are SUID_DUMP_* defined in linux/sched/coredump.h
*/
#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1)
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE 2
#define MMF_DUMP_ANON_SHARED 3
@@ -1744,13 +1785,13 @@ enum {
#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
#define MMF_DUMP_FILTER_BITS 9
#define MMF_DUMP_FILTER_MASK \
- (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
+ ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
#define MMF_DUMP_FILTER_DEFAULT \
- ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
- (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+ (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \
+ BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
-# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
+# define MMF_DUMP_MASK_DEFAULT_ELF BIT(MMF_DUMP_ELF_HEADERS)
#else
# define MMF_DUMP_MASK_DEFAULT_ELF 0
#endif
@@ -1758,19 +1799,16 @@ enum {
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */
-/*
- * This one-shot flag is dropped due to necessity of changing exe once again
- * on NFS restore
- */
-//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
+#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */
#define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
-#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
+#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */
+#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */
+#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \
+ BIT(MMF_DISABLE_THP_EXCEPT_ADVISED))
#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
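
With MMF_DISABLE_THP split above into MMF_DISABLE_THP_COMPLETELY and MMF_DISABLE_THP_EXCEPT_ADVISED (and MMF_DISABLE_THP_MASK covering both), a hypothetical eligibility check, not taken from this patch, would compose them like so:

        /* Hypothetical: "completely" always wins; "except advised" spares advised VMAs. */
        static inline bool hyp_thp_forbidden(const struct mm_struct *mm, bool advised)
        {
                if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
                        return true;
                return !advised && mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
        }
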
/*
@@ -1783,27 +1821,33 @@ enum {
#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
#define MMF_HAS_MDWE 28
-#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
-
+#define MMF_HAS_MDWE_MASK BIT(MMF_HAS_MDWE)
#define MMF_HAS_MDWE_NO_INHERIT 29
#define MMF_VM_MERGE_ANY 30
-#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY)
+#define MMF_VM_MERGE_ANY_MASK BIT(MMF_VM_MERGE_ANY)
#define MMF_TOPDOWN 31 /* mm searches top down by default */
-#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN)
+#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN)
-#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
-static inline unsigned long mmf_init_flags(unsigned long flags)
+/* Legacy flags must fit within 32 bits. */
+static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX);
+
+/*
+ * Initialise legacy flags according to masks, propagating selected flags on
+ * fork. Further flag manipulation can be performed by the caller.
+ */
+static inline unsigned long mmf_init_legacy_flags(unsigned long flags)
{
if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
flags &= ~((1UL << MMF_HAS_MDWE) |
(1UL << MMF_HAS_MDWE_NO_INHERIT));
- return flags & MMF_INIT_MASK;
+ return flags & MMF_INIT_LEGACY_MASK;
}
#endif /* _LINUX_MM_TYPES_H */
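
mmf_init_legacy_flags() above encodes one subtle fork-time rule: MDWE is normally inherited, but a parent that also set MMF_HAS_MDWE_NO_INHERIT has both MDWE bits stripped before the init mask is applied. A worked example:

        /*
         * Parent: MDWE enabled but marked no-inherit, plus top-down search:
         *   flags = BIT(MMF_HAS_MDWE) | BIT(MMF_HAS_MDWE_NO_INHERIT) | BIT(MMF_TOPDOWN)
         *
         * mmf_init_legacy_flags(flags):
         *   1. NO_INHERIT is set -> both MDWE bits cleared
         *   2. result &= MMF_INIT_LEGACY_MASK
         *   => BIT(MMF_TOPDOWN): the child keeps top-down search, without MDWE.
         */
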
diff --git a/include/linux/mman.h b/include/linux/mman.h
index de9e8e6229a4..0ba8a7e8b90a 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -201,7 +201,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
/* If MDWE is disabled, we have nothing to deny. */
- if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ if (!mm_flags_test(MMF_HAS_MDWE, current->mm))
return false;
/* If the new VMA is not executable, we have nothing to deny. */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 11a078de9150..2c9fffa58714 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -148,91 +148,6 @@ static inline void vma_refcount_put(struct vm_area_struct *vma)
}
/*
- * Try to read-lock a vma. The function is allowed to occasionally yield false
- * locked result to avoid performance overhead, in which case we fall back to
- * using mmap_lock. The function should never yield false unlocked result.
- * False locked result is possible if mm_lock_seq overflows or if vma gets
- * reused and attached to a different mm before we lock it.
- * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
- * detached.
- *
- * WARNING! The vma passed to this function cannot be used if the function
- * fails to lock it because in certain cases RCU lock is dropped and then
- * reacquired. Once the RCU lock is dropped the vma can be concurrently freed.
- */
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
- struct vm_area_struct *vma)
-{
- int oldcnt;
-
- /*
- * Check before locking. A race might cause false locked result.
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need
- * ACQUIRE semantics, because this is just a lockless check whose result
- * we don't rely on for anything - the mm_lock_seq read against which we
- * need ordering is below.
- */
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
- return NULL;
-
- /*
- * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
- * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
- * Acquire fence is required here to avoid reordering against later
- * vm_lock_seq check and checks inside lock_vma_under_rcu().
- */
- if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VMA_REF_LIMIT))) {
- /* return EAGAIN if vma got detached from under us */
- return oldcnt ? NULL : ERR_PTR(-EAGAIN);
- }
-
- rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
-
- /*
- * If vma got attached to another mm from under us, that mm is not
- * stable and can be freed in the narrow window after vma->vm_refcnt
- * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
- * releasing vma->vm_refcnt.
- */
- if (unlikely(vma->vm_mm != mm)) {
- /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
- struct mm_struct *other_mm = vma->vm_mm;
-
- /*
- * __mmdrop() is a heavy operation and we don't need RCU
- * protection here. Release RCU lock during these operations.
- * We reinstate the RCU read lock as the caller expects it to
- * be held when this function returns even on error.
- */
- rcu_read_unlock();
- mmgrab(other_mm);
- vma_refcount_put(vma);
- mmdrop(other_mm);
- rcu_read_lock();
- return NULL;
- }
-
- /*
- * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
- * False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
- * modification invalidates all existing locks.
- *
- * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
- * racing with vma_end_write_all(), we only start reading from the VMA
- * after it has been unlocked.
- * This pairs with RELEASE semantics in vma_end_write_all().
- */
- if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
- vma_refcount_put(vma);
- return NULL;
- }
-
- return vma;
-}
-
-/*
* Use only while holding mmap read lock which guarantees that locking will not
* fail (nobody can concurrently write-lock the vma). vma_start_read() should
* not be used in such cases because it might fail due to mm_lock_seq overflow.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c5da9141983..7fb7331c5725 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -234,7 +234,21 @@ enum node_stat_item {
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
- PGPROMOTE_CANDIDATE, /* candidate pages to promote */
+ /**
+ * Candidate pages for promotion based on hint fault latency. This
+ * counter is used to control the promotion rate and adjust the hot
+ * threshold.
+ */
+ PGPROMOTE_CANDIDATE,
+ /**
+ * Not rate-limited (NRL) candidate pages, which can be promoted
+ * without considering the hot threshold because the fast-tier node
+ * has enough free pages. These promotions bypass the regular hotness checks
+ * and do NOT influence the promotion rate-limiter or
+ * threshold-adjustment logic.
+ * This is for statistics/monitoring purposes.
+ */
+ PGPROMOTE_CANDIDATE_NRL,
#endif
/* PGDEMOTE_*: pages demoted */
PGDEMOTE_KSWAPD,
@@ -245,6 +259,7 @@ enum node_stat_item {
NR_HUGETLB,
#endif
NR_BALLOON_PAGES,
+ NR_KERNEL_FILE_PAGES,
NR_VM_NODE_STAT_ITEMS
};
@@ -1089,7 +1104,7 @@ static inline unsigned long promo_wmark_pages(const struct zone *z)
return wmark_pages(z, WMARK_PROMO);
}
-static inline unsigned long zone_managed_pages(struct zone *zone)
+static inline unsigned long zone_managed_pages(const struct zone *zone)
{
return (unsigned long)atomic_long_read(&zone->managed_pages);
}
@@ -1113,12 +1128,12 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
}
-static inline bool zone_is_initialized(struct zone *zone)
+static inline bool zone_is_initialized(const struct zone *zone)
{
return zone->initialized;
}
-static inline bool zone_is_empty(struct zone *zone)
+static inline bool zone_is_empty(const struct zone *zone)
{
return zone->spanned_pages == 0;
}
@@ -1169,26 +1184,31 @@ static inline bool zone_is_empty(struct zone *zone)
#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
+static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags)
+{
+ ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT);
+ return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK;
+}
+
static inline enum zone_type page_zonenum(const struct page *page)
{
- ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
- return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
+ return memdesc_zonenum(page->flags);
}
static inline enum zone_type folio_zonenum(const struct folio *folio)
{
- return page_zonenum(&folio->page);
+ return memdesc_zonenum(folio->flags);
}
#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_zone_device_page(const struct page *page)
+static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
{
- return page_zonenum(page) == ZONE_DEVICE;
+ return memdesc_zonenum(mdf) == ZONE_DEVICE;
}
static inline struct dev_pagemap *page_pgmap(const struct page *page)
{
- VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page);
+ VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page);
return page_folio(page)->pgmap;
}
@@ -1203,9 +1223,9 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page)
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
const struct page *b)
{
- if (is_zone_device_page(a) != is_zone_device_page(b))
+ if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags))
return false;
- if (!is_zone_device_page(a))
+ if (!memdesc_is_zone_device(a->flags))
return true;
return page_pgmap(a) == page_pgmap(b);
}
@@ -1213,7 +1233,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
extern void memmap_init_zone_device(struct zone *, unsigned long,
unsigned long, struct dev_pagemap *);
#else
-static inline bool is_zone_device_page(const struct page *page)
+static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
{
return false;
}
@@ -1228,9 +1248,14 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page)
}
#endif
+static inline bool is_zone_device_page(const struct page *page)
+{
+ return memdesc_is_zone_device(page->flags);
+}
+
static inline bool folio_is_zone_device(const struct folio *folio)
{
- return is_zone_device_page(&folio->page);
+ return memdesc_is_zone_device(folio->flags);
}
static inline bool is_zone_movable_page(const struct page *page)
@@ -1248,7 +1273,7 @@ static inline bool folio_is_zone_movable(const struct folio *folio)
* Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
* intersection with the given zone
*/
-static inline bool zone_intersects(struct zone *zone,
+static inline bool zone_intersects(const struct zone *zone,
unsigned long start_pfn, unsigned long nr_pages)
{
if (zone_is_empty(zone))
@@ -1415,7 +1440,7 @@ typedef struct pglist_data {
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
- int kswapd_failures; /* Number of 'reclaimed == 0' runs */
+ atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
@@ -1556,12 +1581,12 @@ static inline int local_memory_node(int node_id) { return node_id; };
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
#ifdef CONFIG_ZONE_DEVICE
-static inline bool zone_is_zone_device(struct zone *zone)
+static inline bool zone_is_zone_device(const struct zone *zone)
{
return zone_idx(zone) == ZONE_DEVICE;
}
#else
-static inline bool zone_is_zone_device(struct zone *zone)
+static inline bool zone_is_zone_device(const struct zone *zone)
{
return false;
}
@@ -1573,19 +1598,19 @@ static inline bool zone_is_zone_device(struct zone *zone)
* populated_zone(). If the whole zone is reserved then we can easily
* end up with populated_zone() && !managed_zone().
*/
-static inline bool managed_zone(struct zone *zone)
+static inline bool managed_zone(const struct zone *zone)
{
return zone_managed_pages(zone);
}
/* Returns true if a zone has memory */
-static inline bool populated_zone(struct zone *zone)
+static inline bool populated_zone(const struct zone *zone)
{
return zone->present_pages;
}
#ifdef CONFIG_NUMA
-static inline int zone_to_nid(struct zone *zone)
+static inline int zone_to_nid(const struct zone *zone)
{
return zone->node;
}
@@ -1595,7 +1620,7 @@ static inline void zone_set_nid(struct zone *zone, int nid)
zone->node = nid;
}
#else
-static inline int zone_to_nid(struct zone *zone)
+static inline int zone_to_nid(const struct zone *zone)
{
return 0;
}
@@ -1622,7 +1647,7 @@ static inline int is_highmem_idx(enum zone_type idx)
* @zone: pointer to struct zone variable
* Return: 1 for a highmem zone, 0 otherwise
*/
-static inline int is_highmem(struct zone *zone)
+static inline int is_highmem(const struct zone *zone)
{
return is_highmem_idx(zone_idx(zone));
}
@@ -1688,12 +1713,12 @@ static inline struct zone *zonelist_zone(struct zoneref *zoneref)
return zoneref->zone;
}
-static inline int zonelist_zone_idx(struct zoneref *zoneref)
+static inline int zonelist_zone_idx(const struct zoneref *zoneref)
{
return zoneref->zone_idx;
}
-static inline int zonelist_node_idx(struct zoneref *zoneref)
+static inline int zonelist_node_idx(const struct zoneref *zoneref)
{
return zone_to_nid(zoneref->zone);
}
@@ -1996,7 +2021,7 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
return (struct page *)map;
}
-static inline int present_section(struct mem_section *section)
+static inline int present_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
}
@@ -2006,12 +2031,12 @@ static inline int present_section_nr(unsigned long nr)
return present_section(__nr_to_section(nr));
}
-static inline int valid_section(struct mem_section *section)
+static inline int valid_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
}
-static inline int early_section(struct mem_section *section)
+static inline int early_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_IS_EARLY));
}
@@ -2021,27 +2046,27 @@ static inline int valid_section_nr(unsigned long nr)
return valid_section(__nr_to_section(nr));
}
-static inline int online_section(struct mem_section *section)
+static inline int online_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_IS_ONLINE));
}
#ifdef CONFIG_ZONE_DEVICE
-static inline int online_device_section(struct mem_section *section)
+static inline int online_device_section(const struct mem_section *section)
{
unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
return section && ((section->section_mem_map & flags) == flags);
}
#else
-static inline int online_device_section(struct mem_section *section)
+static inline int online_device_section(const struct mem_section *section)
{
return 0;
}
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
-static inline int preinited_vmemmap_section(struct mem_section *section)
+static inline int preinited_vmemmap_section(const struct mem_section *section)
{
return (section &&
(section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT));
@@ -2051,7 +2076,7 @@ void sparse_vmemmap_init_nid_early(int nid);
void sparse_vmemmap_init_nid_late(int nid);
#else
-static inline int preinited_vmemmap_section(struct mem_section *section)
+static inline int preinited_vmemmap_section(const struct mem_section *section)
{
return 0;
}
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 98c96d649bf9..72ee7d210a74 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -21,7 +21,7 @@
#include <linux/rolling_buffer.h>
enum netfs_sreq_ref_trace;
-typedef struct mempool_s mempool_t;
+typedef struct mempool mempool_t;
struct folio_queue;
/**
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 1e0fc6931ce9..7b02bc1d0a7e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -91,7 +91,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
*/
static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
{
- if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
+ if (unlikely(mm_flags_test(MMF_UNSTABLE, mm)))
return VM_FAULT_SIGBUS;
return 0;
}
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8d3fa3a91ce4..48e27768e7ba 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -217,7 +217,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page
* cold cacheline in some cases.
*/
if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) &&
- test_bit(PG_head, &page->flags)) {
+ test_bit(PG_head, &page->flags.f)) {
/*
* We can safely access the field of the @page[1] with PG_head
* because the @page is a compound page composed with at least
@@ -316,7 +316,7 @@ static __always_inline unsigned long _compound_head(const struct page *page)
* check that the page number lies within @folio; the caller is presumed
* to have a reference to the page.
*/
-#define folio_page(folio, n) nth_page(&(folio)->page, n)
+#define folio_page(folio, n) (&(folio)->page + (n))
static __always_inline int PageTail(const struct page *page)
{
@@ -325,14 +325,14 @@ static __always_inline int PageTail(const struct page *page)
static __always_inline int PageCompound(const struct page *page)
{
- return test_bit(PG_head, &page->flags) ||
+ return test_bit(PG_head, &page->flags.f) ||
READ_ONCE(page->compound_head) & 1;
}
#define PAGE_POISON_PATTERN -1l
static inline int PagePoisoned(const struct page *page)
{
- return READ_ONCE(page->flags) == PAGE_POISON_PATTERN;
+ return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN;
}
#ifdef CONFIG_DEBUG_VM
@@ -349,8 +349,8 @@ static const unsigned long *const_folio_flags(const struct folio *folio,
const struct page *page = &folio->page;
VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
- VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
- return &page[n].flags;
+ VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page);
+ return &page[n].flags.f;
}
static unsigned long *folio_flags(struct folio *folio, unsigned n)
@@ -358,8 +358,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
struct page *page = &folio->page;
VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
- VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
- return &page[n].flags;
+ VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page);
+ return &page[n].flags.f;
}
/*
@@ -449,37 +449,37 @@ FOLIO_CLEAR_FLAG(name, page)
#define TESTPAGEFLAG(uname, lname, policy) \
FOLIO_TEST_FLAG(lname, FOLIO_##policy) \
static __always_inline int Page##uname(const struct page *page) \
-{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
+{ return test_bit(PG_##lname, &policy(page, 0)->flags.f); }
#define SETPAGEFLAG(uname, lname, policy) \
FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void SetPage##uname(struct page *page) \
-{ set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define CLEARPAGEFLAG(uname, lname, policy) \
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void ClearPage##uname(struct page *page) \
-{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define __SETPAGEFLAG(uname, lname, policy) \
__FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void __SetPage##uname(struct page *page) \
-{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define __CLEARPAGEFLAG(uname, lname, policy) \
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void __ClearPage##uname(struct page *page) \
-{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define TESTSETFLAG(uname, lname, policy) \
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestSetPage##uname(struct page *page) \
-{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define TESTCLEARFLAG(uname, lname, policy) \
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestClearPage##uname(struct page *page) \
-{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define PAGEFLAG(uname, lname, policy) \
TESTPAGEFLAG(uname, lname, policy) \
@@ -846,7 +846,7 @@ static __always_inline bool folio_test_head(const struct folio *folio)
static __always_inline int PageHead(const struct page *page)
{
PF_POISONED_CHECK(page);
- return test_bit(PG_head, &page->flags) && !page_is_fake_head(page);
+ return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page);
}
__SETPAGEFLAG(Head, head, PF_ANY)
@@ -1170,28 +1170,28 @@ static __always_inline int PageAnonExclusive(const struct page *page)
*/
if (PageHuge(page))
page = compound_head(page);
- return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void SetPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void ClearPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void __ClearPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
#ifdef CONFIG_MMU
@@ -1241,7 +1241,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
*/
static inline int folio_has_private(const struct folio *folio)
{
- return !!(folio->flags & PAGE_FLAGS_PRIVATE);
+ return !!(folio->flags.f & PAGE_FLAGS_PRIVATE);
}
#undef PF_ANY
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 6a44be0f39f4..e046278a01fa 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -13,12 +13,11 @@
#include <linux/types.h>
-#define PB_migratetype_bits 3
/* Bit indices that affect a whole block of pages */
enum pageblock_bits {
- PB_migrate,
- PB_migrate_end = PB_migrate + PB_migratetype_bits - 1,
- /* 3 bits required for migrate types */
+ PB_migrate_0,
+ PB_migrate_1,
+ PB_migrate_2,
PB_compact_skip,/* If set the block is skipped by compaction */
#ifdef CONFIG_MEMORY_ISOLATION
@@ -37,11 +36,10 @@ enum pageblock_bits {
#define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS))
-#define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1)
+#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2))
#ifdef CONFIG_MEMORY_ISOLATION
-#define MIGRATETYPE_AND_ISO_MASK \
- (((1UL << (PB_migrate_end + 1)) - 1) | BIT(PB_migrate_isolate))
+#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate))
#else
#define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK
#endif
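
The rewritten MIGRATETYPE_MASK spells out the same three low bits the old arithmetic produced: with PB_migrate_0..PB_migrate_2 occupying bits 0-2, both forms evaluate to 0x7. A static_assert one could add to pin that down (not part of this patch):

        /* Bits 0-2: exactly what (1UL << (PB_migrate_end + 1)) - 1 used to yield. */
        static_assert(MIGRATETYPE_MASK == 0x7);
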
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 12a12dae727d..185644e288ea 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -140,7 +140,7 @@ static inline int inode_drain_writes(struct inode *inode)
return filemap_write_and_wait(inode->i_mapping);
}
-static inline bool mapping_empty(struct address_space *mapping)
+static inline bool mapping_empty(const struct address_space *mapping)
{
return xa_empty(&mapping->i_pages);
}
@@ -166,7 +166,7 @@ static inline bool mapping_empty(struct address_space *mapping)
* refcount and the referenced bit, which will be elevated or set in
* the process of adding new cache pages to an inode.
*/
-static inline bool mapping_shrinkable(struct address_space *mapping)
+static inline bool mapping_shrinkable(const struct address_space *mapping)
{
void *head;
@@ -211,6 +211,8 @@ enum mapping_flags {
folio contents */
AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */
AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
+ AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
+ account usage to user cgroups */
/* Bits 16-25 are used for FOLIO_ORDER */
AS_FOLIO_ORDER_BITS = 5,
AS_FOLIO_ORDER_MIN = 16,
@@ -265,7 +267,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping)
clear_bit(AS_UNEVICTABLE, &mapping->flags);
}
-static inline bool mapping_unevictable(struct address_space *mapping)
+static inline bool mapping_unevictable(const struct address_space *mapping)
{
return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags);
}
@@ -275,7 +277,7 @@ static inline void mapping_set_exiting(struct address_space *mapping)
set_bit(AS_EXITING, &mapping->flags);
}
-static inline int mapping_exiting(struct address_space *mapping)
+static inline int mapping_exiting(const struct address_space *mapping)
{
return test_bit(AS_EXITING, &mapping->flags);
}
@@ -285,7 +287,7 @@ static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
-static inline int mapping_use_writeback_tags(struct address_space *mapping)
+static inline int mapping_use_writeback_tags(const struct address_space *mapping)
{
return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
@@ -331,7 +333,7 @@ static inline void mapping_set_inaccessible(struct address_space *mapping)
set_bit(AS_INACCESSIBLE, &mapping->flags);
}
-static inline bool mapping_inaccessible(struct address_space *mapping)
+static inline bool mapping_inaccessible(const struct address_space *mapping)
{
return test_bit(AS_INACCESSIBLE, &mapping->flags);
}
@@ -341,18 +343,18 @@ static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_
set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
}
-static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping)
+static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping)
{
return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
}
-static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
+static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
{
return mapping->gfp_mask;
}
/* Restricts the given gfp_mask to what the mapping allows. */
-static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
+static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping,
gfp_t gfp_mask)
{
return mapping_gfp_mask(mapping) & gfp_mask;
@@ -475,11 +477,17 @@ mapping_min_folio_order(const struct address_space *mapping)
}
static inline unsigned long
-mapping_min_folio_nrpages(struct address_space *mapping)
+mapping_min_folio_nrpages(const struct address_space *mapping)
{
return 1UL << mapping_min_folio_order(mapping);
}
+static inline unsigned long
+mapping_min_folio_nrbytes(const struct address_space *mapping)
+{
+ return mapping_min_folio_nrpages(mapping) << PAGE_SHIFT;
+}
+
/**
* mapping_align_index() - Align index for this mapping.
* @mapping: The address_space.
@@ -489,7 +497,7 @@ mapping_min_folio_nrpages(struct address_space *mapping)
* new folio to the page cache and need to know what index to give it,
* call this function.
*/
-static inline pgoff_t mapping_align_index(struct address_space *mapping,
+static inline pgoff_t mapping_align_index(const struct address_space *mapping,
pgoff_t index)
{
return round_down(index, mapping_min_folio_nrpages(mapping));
@@ -499,7 +507,7 @@ static inline pgoff_t mapping_align_index(struct address_space *mapping,
* Large folio support currently depends on THP. These dependencies are
* being worked on but are not yet fixed.
*/
-static inline bool mapping_large_folio_support(struct address_space *mapping)
+static inline bool mapping_large_folio_support(const struct address_space *mapping)
{
/* AS_FOLIO_ORDER is only reasonable for pagecache folios */
VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON,
@@ -514,7 +522,7 @@ static inline size_t mapping_max_folio_size(const struct address_space *mapping)
return PAGE_SIZE << mapping_max_folio_order(mapping);
}
-static inline int filemap_nr_thps(struct address_space *mapping)
+static inline int filemap_nr_thps(const struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
return atomic_read(&mapping->nr_thps);
@@ -543,7 +551,7 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
#endif
}
-struct address_space *folio_mapping(struct folio *);
+struct address_space *folio_mapping(const struct folio *folio);
/**
* folio_flush_mapping - Find the file mapping this folio belongs to.
@@ -928,7 +936,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
*
* Return: The index of the folio which follows this folio in the file.
*/
-static inline pgoff_t folio_next_index(struct folio *folio)
+static inline pgoff_t folio_next_index(const struct folio *folio)
{
return folio->index + folio_nr_pages(folio);
}
@@ -957,7 +965,7 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
* e.g., shmem did not move this folio to the swap cache.
* Return: true or false.
*/
-static inline bool folio_contains(struct folio *folio, pgoff_t index)
+static inline bool folio_contains(const struct folio *folio, pgoff_t index)
{
VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
return index - folio->index < folio_nr_pages(folio);
@@ -1034,13 +1042,13 @@ static inline loff_t page_offset(struct page *page)
/*
* Get the offset in PAGE_SIZE (even for hugetlb folios).
*/
-static inline pgoff_t folio_pgoff(struct folio *folio)
+static inline pgoff_t folio_pgoff(const struct folio *folio)
{
return folio->index;
}
-static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
- unsigned long address)
+static inline pgoff_t linear_page_index(const struct vm_area_struct *vma,
+ const unsigned long address)
{
pgoff_t pgoff;
pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
@@ -1460,7 +1468,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
* readahead_pos - The byte offset into the file of this readahead request.
* @rac: The readahead request.
*/
-static inline loff_t readahead_pos(struct readahead_control *rac)
+static inline loff_t readahead_pos(const struct readahead_control *rac)
{
return (loff_t)rac->_index * PAGE_SIZE;
}
@@ -1469,7 +1477,7 @@ static inline loff_t readahead_pos(struct readahead_control *rac)
* readahead_length - The number of bytes in this readahead request.
* @rac: The readahead request.
*/
-static inline size_t readahead_length(struct readahead_control *rac)
+static inline size_t readahead_length(const struct readahead_control *rac)
{
return rac->_nr_pages * PAGE_SIZE;
}
@@ -1478,7 +1486,7 @@ static inline size_t readahead_length(struct readahead_control *rac)
* readahead_index - The index of the first page in this readahead request.
* @rac: The readahead request.
*/
-static inline pgoff_t readahead_index(struct readahead_control *rac)
+static inline pgoff_t readahead_index(const struct readahead_control *rac)
{
return rac->_index;
}
@@ -1487,7 +1495,7 @@ static inline pgoff_t readahead_index(struct readahead_control *rac)
* readahead_count - The number of pages in this readahead request.
* @rac: The readahead request.
*/
-static inline unsigned int readahead_count(struct readahead_control *rac)
+static inline unsigned int readahead_count(const struct readahead_control *rac)
{
return rac->_nr_pages;
}
@@ -1496,12 +1504,12 @@ static inline unsigned int readahead_count(struct readahead_control *rac)
* readahead_batch_length - The number of bytes in the current batch.
* @rac: The readahead request.
*/
-static inline size_t readahead_batch_length(struct readahead_control *rac)
+static inline size_t readahead_batch_length(const struct readahead_control *rac)
{
return rac->_batch_count * PAGE_SIZE;
}
-static inline unsigned long dir_pages(struct inode *inode)
+static inline unsigned long dir_pages(const struct inode *inode)
{
return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT;
@@ -1515,8 +1523,8 @@ static inline unsigned long dir_pages(struct inode *inode)
* Return: the number of bytes in the folio up to EOF,
* or -EFAULT if the folio was truncated.
*/
-static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
- struct inode *inode)
+static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio,
+ const struct inode *inode)
{
loff_t size = i_size_read(inode);
pgoff_t index = size >> PAGE_SHIFT;
@@ -1547,7 +1555,8 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
* Return: The number of filesystem blocks covered by this folio.
*/
static inline
-unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio)
+unsigned int i_blocks_per_folio(const struct inode *inode,
+ const struct folio *folio)
{
return folio_size(folio) >> inode->i_blkbits;
}
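
A brief illustration of the index-alignment helper above (a sketch, not part of the patch): mapping_align_index() rounds an index down to the mapping's minimum folio size, so lookups and insertions start on a boundary the filesystem's minimum folio order allows.

	/*
	 * Illustrative only: with a minimum folio order of 2 (4 pages),
	 * round_down(13, 4) == 12, so index 13 aligns down to 12.
	 */
	static pgoff_t example_aligned_index(const struct address_space *mapping)
	{
		pgoff_t index = 13;

		return mapping_align_index(mapping, index);
	}
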
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 5d3a0cccc6bf..63be5a451627 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -51,12 +51,12 @@ static inline void folio_batch_reinit(struct folio_batch *fbatch)
fbatch->i = 0;
}
-static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
+static inline unsigned int folio_batch_count(const struct folio_batch *fbatch)
{
return fbatch->nr;
}
-static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
+static inline unsigned int folio_batch_space(const struct folio_batch *fbatch)
{
return PAGEVEC_SIZE - fbatch->nr;
}
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 8a7f4f802c57..38a82d65e58e 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -107,7 +107,8 @@ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref,
if (static_key_enabled(&mem_profiling_compressed)) {
pgalloc_tag_idx idx;
- idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask;
+ idx = (page->flags.f >> alloc_tag_ref_offs) &
+ alloc_tag_ref_mask;
idx_to_ref(idx, ref);
handle->page = page;
} else {
@@ -149,11 +150,11 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code
idx = (unsigned long)ref_to_idx(ref);
idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs;
do {
- old_flags = READ_ONCE(page->flags);
+ old_flags = READ_ONCE(page->flags.f);
flags = old_flags;
flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs);
flags |= idx;
- } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
} else {
if (WARN_ON(!handle.ref || !ref))
return;
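
The `.f` accesses above reflect the memdesc_flags_t conversion: page->flags is no longer a bare unsigned long, so open-coded readers and cmpxchg users go through one extra member. A minimal sketch of the access pattern, assuming memdesc_flags_t is a single-member wrapper around the old flags word (as the `.f` accesses in this hunk imply):

	/*
	 * Sketch only:
	 *	old: flags = READ_ONCE(page->flags);
	 *	new: flags = READ_ONCE(page->flags.f);
	 */
	static inline unsigned long example_read_page_flags(const struct page *page)
	{
		return READ_ONCE(page->flags.f);
	}
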
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 25a7257052ff..32e8457ad535 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1975,6 +1975,32 @@ static inline bool arch_has_pfn_modify_check(void)
/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;
+enum pgtable_level {
+ PGTABLE_LEVEL_PTE = 0,
+ PGTABLE_LEVEL_PMD,
+ PGTABLE_LEVEL_PUD,
+ PGTABLE_LEVEL_P4D,
+ PGTABLE_LEVEL_PGD,
+};
+
+static inline const char *pgtable_level_to_str(enum pgtable_level level)
+{
+ switch (level) {
+ case PGTABLE_LEVEL_PTE:
+ return "pte";
+ case PGTABLE_LEVEL_PMD:
+ return "pmd";
+ case PGTABLE_LEVEL_PUD:
+ return "pud";
+ case PGTABLE_LEVEL_P4D:
+ return "p4d";
+ case PGTABLE_LEVEL_PGD:
+ return "pgd";
+ default:
+ return "unknown";
+ }
+}
+
#endif /* !__ASSEMBLY__ */
#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
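
The new enum pgtable_level and pgtable_level_to_str() give page-table code a single, shared notion of granularity (rmap.h previously carried its own rmap_level, removed below). A hedged usage sketch for diagnostics:

	/* Illustrative only: reporting the level at which a walk stopped. */
	static void example_report_level(enum pgtable_level level, unsigned long addr)
	{
		pr_debug("address %#lx mapped at %s level\n",
			 addr, pgtable_level_to_str(level));
	}
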
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6cd020eea37a..daa92a58585d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -394,18 +394,8 @@ typedef int __bitwise rmap_t;
/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0))
-/*
- * Internally, we're using an enum to specify the granularity. We make the
- * compiler emit specialized code for each granularity.
- */
-enum rmap_level {
- RMAP_LEVEL_PTE = 0,
- RMAP_LEVEL_PMD,
- RMAP_LEVEL_PUD,
-};
-
-static inline void __folio_rmap_sanity_checks(const struct folio *folio,
- const struct page *page, int nr_pages, enum rmap_level level)
+static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio,
+ const struct page *page, int nr_pages, enum pgtable_level level)
{
/* hugetlb folios are handled separately. */
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
@@ -427,18 +417,18 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
break;
- case RMAP_LEVEL_PMD:
+ case PGTABLE_LEVEL_PMD:
/*
* We don't support folios larger than a single PMD yet. So
- * when RMAP_LEVEL_PMD is set, we assume that we are creating
+ * when PGTABLE_LEVEL_PMD is set, we assume that we are creating
* a single "entire" mapping of the folio.
*/
VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
break;
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PUD:
/*
* Assume that we are creating a single "entire" mapping of the
* folio.
@@ -447,7 +437,7 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
break;
default:
- VM_WARN_ON_ONCE(true);
+ BUILD_BUG();
}
/*
@@ -567,14 +557,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
const int orig_nr_pages = nr_pages;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
atomic_inc(&folio->_mapcount);
break;
@@ -587,11 +577,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
}
folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
atomic_inc(&folio->_entire_mapcount);
folio_inc_large_mapcount(folio, dst_vma);
break;
+ default:
+ BUILD_BUG();
}
}
@@ -609,13 +601,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
- __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE);
}
static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
struct page *page, struct vm_area_struct *dst_vma)
{
- __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -632,7 +624,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio,
struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE);
#else
WARN_ON_ONCE(true);
#endif
@@ -640,7 +632,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio,
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma, enum rmap_level level)
+ struct vm_area_struct *src_vma, enum pgtable_level level)
{
const int orig_nr_pages = nr_pages;
bool maybe_pinned;
@@ -665,7 +657,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
* copying if the folio maybe pinned.
*/
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (unlikely(maybe_pinned)) {
for (i = 0; i < nr_pages; i++)
if (PageAnonExclusive(page + i))
@@ -687,8 +679,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
} while (page++, --nr_pages > 0);
folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
if (PageAnonExclusive(page)) {
if (unlikely(maybe_pinned))
return -EBUSY;
@@ -697,6 +689,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
atomic_inc(&folio->_entire_mapcount);
folio_inc_large_mapcount(folio, dst_vma);
break;
+ default:
+ BUILD_BUG();
}
return 0;
}
@@ -730,7 +724,7 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
struct vm_area_struct *src_vma)
{
return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
- src_vma, RMAP_LEVEL_PTE);
+ src_vma, PGTABLE_LEVEL_PTE);
}
static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
@@ -738,7 +732,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
struct vm_area_struct *src_vma)
{
return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
- RMAP_LEVEL_PTE);
+ PGTABLE_LEVEL_PTE);
}
/**
@@ -770,7 +764,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
- src_vma, RMAP_LEVEL_PMD);
+ src_vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
return -EBUSY;
@@ -778,7 +772,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
}
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level)
+ struct page *page, int nr_pages, enum pgtable_level level)
{
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
@@ -873,7 +867,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
struct page *page)
{
- return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+ return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE);
}
/**
@@ -904,7 +898,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
- RMAP_LEVEL_PMD);
+ PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
return -EBUSY;
@@ -928,6 +922,11 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION (1 << 1)
+/* Result flags */
+
+/* The page is mapped across page table boundary */
+#define PVMW_PGTABLE_CROSSED (1 << 16)
+
struct page_vma_mapped_walk {
unsigned long pfn;
unsigned long nr_pages;
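
Two details of the rmap.h conversion are worth noting: the sanity-check helper moves from a runtime VM_WARN_ON_ONCE() to BUILD_BUG() in the default branch, which only compiles because the helpers are __always_inline and every caller passes a constant level, letting the switch fold away; and PVMW_PGTABLE_CROSSED is a result flag set by the walk, not a request flag passed in. A hedged sketch of consuming that flag (everything outside the pvmw API is illustrative):

	/* Illustrative consumer of the new result flag; not part of this patch. */
	static bool example_mapping_crosses_pgtable(struct folio *folio,
						    struct vm_area_struct *vma,
						    unsigned long address)
	{
		DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);

		while (page_vma_mapped_walk(&pvmw))
			;	/* per-mapping processing would go here */

		/* set by the walk if the folio spans a page-table boundary */
		return pvmw.flags & PVMW_PGTABLE_CROSSED;
	}
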
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 6f8a4965f9b9..29f6ceb98d74 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -158,6 +158,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
unsigned int len, unsigned int offset)
{
+ VM_WARN_ON_ONCE(!page_range_contiguous(page, ALIGN(len + offset, PAGE_SIZE) / PAGE_SIZE));
sg_assign_page(sg, page);
sg->offset = offset;
sg->length = len;
@@ -600,7 +601,7 @@ void __sg_page_iter_start(struct sg_page_iter *piter,
*/
static inline struct page *sg_page_iter_page(struct sg_page_iter *piter)
{
- return nth_page(sg_page(piter->sg), piter->sg_pgoffset);
+ return sg_page(piter->sg) + piter->sg_pgoffset;
}
/**
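
With nth_page() gone, sg_page_iter_page() uses plain pointer arithmetic, which is only correct because an sg entry must now cover a contiguous range of struct pages; the new VM_WARN_ON_ONCE() in sg_set_page() enforces that at the point the range is handed over. A hedged sketch of relying on that invariant:

	/* Illustrative only: last PFN of an sg entry without nth_page(). */
	static unsigned long example_last_pfn(struct scatterlist *sg)
	{
		unsigned int nr = ALIGN(sg->length + sg->offset, PAGE_SIZE) / PAGE_SIZE;

		/* plain arithmetic is valid: sg_set_page() warns on non-contiguous ranges */
		return page_to_pfn(sg_page(sg) + nr - 1);
	}
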
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 6eb65ceed213..b7fafe999073 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,6 +8,20 @@
#define SUID_DUMP_USER 1 /* Dump as user of process */
#define SUID_DUMP_ROOT 2 /* Dump as root */
+static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm)
+{
+ /*
+ * By convention, dumpable bits are contained in first 32 bits of the
+ * bitmap, so we can simply access this first unsigned long directly.
+ */
+ return __mm_flags_get_word(mm);
+}
+
+static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value)
+{
+ __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value);
+}
+
extern void set_dumpable(struct mm_struct *mm, int value);
/*
* This returns the actual value of the suid_dumpable flag. For things
@@ -22,7 +36,9 @@ static inline int __get_dumpable(unsigned long mm_flags)
static inline int get_dumpable(struct mm_struct *mm)
{
- return __get_dumpable(mm->flags);
+ unsigned long flags = __mm_flags_get_dumpable(mm);
+
+ return __get_dumpable(flags);
}
#endif /* _LINUX_SCHED_COREDUMP_H */
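
The dumpable accessors now read the first word of the mm->flags bitmap rather than a plain unsigned long field; since the MMF_DUMPABLE_* bits live in the low 32 bits by convention, one word is enough. A minimal, hedged sketch of a caller (the attach policy shown is illustrative only):

	/* Illustrative only: a dumpability check built on get_dumpable(). */
	static bool example_may_attach(struct mm_struct *mm)
	{
		/* reads only the first word of the mm flags bitmap, as above */
		return get_dumpable(mm) == SUID_DUMP_USER;
	}
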
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2201da0afecc..0232d983b715 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -178,7 +178,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
#endif
extern void arch_pick_mmap_layout(struct mm_struct *mm,
- struct rlimit *rlim_stack);
+ const struct rlimit *rlim_stack);
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -211,7 +211,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long flags, vm_flags_t vm_flags);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
- struct rlimit *rlim_stack) {}
+ const struct rlimit *rlim_stack) {}
#endif
static inline bool in_vfork(struct task_struct *tsk)
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6d0f9c599ff7..0e47465ef0fd 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -99,9 +99,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
#ifdef CONFIG_SHMEM
-bool shmem_mapping(struct address_space *mapping);
+bool shmem_mapping(const struct address_space *mapping);
#else
-static inline bool shmem_mapping(struct address_space *mapping)
+static inline bool shmem_mapping(const struct address_space *mapping)
{
return false;
}
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 561597dd2164..cf443f064a66 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -496,9 +496,13 @@ int kmem_cache_shrink(struct kmem_cache *s);
/*
* Common kmalloc functions provided by all allocators
*/
-void * __must_check krealloc_noprof(const void *objp, size_t new_size,
- gfp_t flags) __realloc_size(2);
-#define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__))
+void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size,
+ unsigned long align,
+ gfp_t flags, int nid) __realloc_size(2);
+#define krealloc_noprof(_o, _s, _f) krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE)
+#define krealloc_node_align(...) alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__))
+#define krealloc_node(_o, _s, _f, _n) krealloc_node_align(_o, _s, 1, _f, _n)
+#define krealloc(...) krealloc_node(__VA_ARGS__, NUMA_NO_NODE)
void kfree(const void *objp);
void kfree_nolock(const void *objp);
@@ -1092,18 +1096,20 @@ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
#define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__))
#define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1);
-#define kvmalloc_node_noprof(size, flags, node) \
- __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node)
-#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))
-
-#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE)
-#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE)
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
+ gfp_t flags, int node) __alloc_size(1);
+#define kvmalloc_node_align_noprof(_size, _align, _flags, _node) \
+ __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node)
+#define kvmalloc_node_align(...) \
+ alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__))
+#define kvmalloc_node(_s, _f, _n) kvmalloc_node_align(_s, 1, _f, _n)
+#define kvmalloc(...) kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE)
#define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO)
#define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
+
#define kmem_buckets_valloc(_b, _size, _flags) \
- alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))
+ alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE))
static inline __alloc_size(1, 2) void *
kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
@@ -1113,7 +1119,7 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- return kvmalloc_node_noprof(bytes, flags, node);
+ return kvmalloc_node_align_noprof(bytes, 1, flags, node);
}
#define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE)
@@ -1124,9 +1130,12 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
#define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
#define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__))
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
- __realloc_size(2);
-#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__))
+void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid) __realloc_size(2);
+#define kvrealloc_node_align(...) \
+ alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__))
+#define kvrealloc_node(_p, _s, _f, _n) kvrealloc_node_align(_p, _s, 1, _f, _n)
+#define kvrealloc(...) kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE)
extern void kvfree(const void *addr);
DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T))
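
With this change krealloc() becomes a thin wrapper around krealloc_node_align(), so callers that care about NUMA placement or alignment can say so explicitly while existing callers keep the old spelling (align 1, NUMA_NO_NODE). A hedged usage sketch of the new entry point (buffer name, size, and alignment are illustrative):

	/* Illustrative only: growing a per-node buffer with explicit alignment. */
	static void *example_grow_buf(void *buf, size_t new_size, int nid)
	{
		/* same semantics as krealloc(), plus 64-byte alignment on node nid */
		return krealloc_node_align(buf, new_size, 64, GFP_KERNEL, nid);
	}
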
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7012a0f758d8..e818fbade1e2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -236,40 +236,6 @@ enum {
#define SWAP_CONT_MAX 0x7f /* Max count */
/*
- * We use this to track usage of a cluster. A cluster is a block of swap disk
- * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
- * free clusters are organized into a list. We fetch an entry from the list to
- * get a free cluster.
- *
- * The flags field determines if a cluster is free. This is
- * protected by cluster lock.
- */
-struct swap_cluster_info {
- spinlock_t lock; /*
- * Protect swap_cluster_info fields
- * other than list, and swap_info_struct->swap_map
- * elements corresponding to the swap cluster.
- */
- u16 count;
- u8 flags;
- u8 order;
- struct list_head list;
-};
-
-/* All on-list cluster must have a non-zero flag. */
-enum swap_cluster_flags {
- CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
- CLUSTER_FLAG_FREE,
- CLUSTER_FLAG_NONFULL,
- CLUSTER_FLAG_FRAG,
- /* Clusters with flags above are allocatable */
- CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
- CLUSTER_FLAG_FULL,
- CLUSTER_FLAG_DISCARD,
- CLUSTER_FLAG_MAX,
-};
-
-/*
* The first page in the swap file is the swap header, which is always marked
* bad to prevent it from being allocated as an entry. This also prevents the
* cluster to which it belongs being marked free. Therefore 0 is safe to use as
@@ -310,7 +276,6 @@ struct swap_info_struct {
/* list of cluster that contains at least one free slot */
struct list_head frag_clusters[SWAP_NR_ORDERS];
/* list of cluster that are fragmented or contented */
- atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int pages; /* total of usable pages of swap */
atomic_long_t inuse_pages; /* number of those currently in use */
struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
@@ -321,11 +286,8 @@ struct swap_info_struct {
struct completion comp; /* seldom referenced */
spinlock_t lock; /*
* protect map scan related fields like
- * swap_map, lowest_bit, highest_bit,
- * inuse_pages, cluster_next,
- * cluster_nr, lowest_alloc,
- * highest_alloc, free/discard cluster
- * list. other fields are only changed
+ * swap_map, inuse_pages and all cluster
+ * lists. other fields are only changed
* at swapon/swapoff, so are protected
* by swap_lock. changing flags need
* hold this lock and swap_lock. If
@@ -517,10 +479,7 @@ extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
-struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
-extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
-extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);
@@ -530,11 +489,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
}
#else /* CONFIG_SWAP */
-static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
-{
- return NULL;
-}
-
static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
return NULL;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9e15a088ba38..92f80b4d69a6 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -2,6 +2,8 @@
#ifndef VM_EVENT_ITEM_H_INCLUDED
#define VM_EVENT_ITEM_H_INCLUDED
+#include <linux/thread_info.h>
+
#ifdef CONFIG_ZONE_DMA
#define DMA_ZONE(xx) xx##_DMA,
#else
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 2759dac6be44..eb54b7b3202f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -197,9 +197,15 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1
extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2);
#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__))
-void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags)
- __realloc_size(2);
-#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__))
+void *__must_check vrealloc_node_align_noprof(const void *p, size_t size,
+ unsigned long align, gfp_t flags, int nid) __realloc_size(2);
+#define vrealloc_node_noprof(_p, _s, _f, _nid) \
+ vrealloc_node_align_noprof(_p, _s, 1, _f, _nid)
+#define vrealloc_noprof(_p, _s, _f) \
+ vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE)
+#define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__))
+#define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__))
+#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__))
extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 15a4bc4ab819..22dd4adc5667 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -362,12 +362,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb);
struct folio *writeback_iter(struct address_space *mapping,
struct writeback_control *wbc, struct folio *folio, int *error);
-typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc,
- void *data);
-
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void writeback_set_ratelimit(void);
void tag_pages_for_writeback(struct address_space *mapping,
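
write_cache_pages() and its writepage_t callback are gone; the remaining pattern is the writeback_iter() loop declared just above, where the caller drives the iteration and the per-folio result is fed back through *error on the next call. A hedged sketch of a converted caller (the per-folio helper is hypothetical and is assumed to unlock the folio, as ->writepage implementations did):

	/* Hypothetical per-folio writeback helper; assumed to unlock the folio. */
	static int example_write_folio(struct folio *folio, struct writeback_control *wbc);

	/* Illustrative conversion away from write_cache_pages(); not from this patch. */
	static int example_writepages(struct address_space *mapping,
				      struct writeback_control *wbc)
	{
		struct folio *folio = NULL;
		int error = 0;

		while ((folio = writeback_iter(mapping, wbc, folio, &error)))
			error = example_write_folio(folio, wbc);

		return error;
	}
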
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
deleted file mode 100644
index 369ef068fad8..000000000000
--- a/include/linux/zpool.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * zpool memory storage api
- *
- * Copyright (C) 2014 Dan Streetman
- *
- * This is a common frontend for the zswap compressed memory storage
- * implementations.
- */
-
-#ifndef _ZPOOL_H_
-#define _ZPOOL_H_
-
-struct zpool;
-
-bool zpool_has_pool(char *type);
-
-struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp);
-
-const char *zpool_get_type(struct zpool *pool);
-
-void zpool_destroy_pool(struct zpool *pool);
-
-int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
- unsigned long *handle, const int nid);
-
-void zpool_free(struct zpool *pool, unsigned long handle);
-
-void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
- void *local_copy);
-
-void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
- void *handle_mem);
-
-void zpool_obj_write(struct zpool *zpool, unsigned long handle,
- void *handle_mem, size_t mem_len);
-
-u64 zpool_get_total_pages(struct zpool *pool);
-
-
-/**
- * struct zpool_driver - driver implementation for zpool
- * @type: name of the driver.
- * @list: entry in the list of zpool drivers.
- * @create: create a new pool.
- * @destroy: destroy a pool.
- * @malloc: allocate mem from a pool.
- * @free: free mem from a pool.
- * @sleep_mapped: whether zpool driver can sleep during map.
- * @map: map a handle.
- * @unmap: unmap a handle.
- * @total_size: get total size of a pool.
- *
- * This is created by a zpool implementation and registered
- * with zpool.
- */
-struct zpool_driver {
- char *type;
- struct module *owner;
- atomic_t refcount;
- struct list_head list;
-
- void *(*create)(const char *name, gfp_t gfp);
- void (*destroy)(void *pool);
-
- int (*malloc)(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle, const int nid);
- void (*free)(void *pool, unsigned long handle);
-
- void *(*obj_read_begin)(void *pool, unsigned long handle,
- void *local_copy);
- void (*obj_read_end)(void *pool, unsigned long handle,
- void *handle_mem);
- void (*obj_write)(void *pool, unsigned long handle,
- void *handle_mem, size_t mem_len);
-
- u64 (*total_pages)(void *pool);
-};
-
-void zpool_register_driver(struct zpool_driver *driver);
-
-int zpool_unregister_driver(struct zpool_driver *driver);
-
-bool zpool_can_sleep_mapped(struct zpool *pool);
-
-#endif