From fad9c80e6371ee04a3fa5728efe20b88d8e4cccd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 May 2023 22:51:01 +0200 Subject: maple_tree: fix a few documentation issues The documentation of mt_next() claims that it starts the search at the provided index. That's incorrect as it starts the search after the provided index. The documentation of mt_find() is slightly confusing. "Handles locking" is not really helpful as it does not explain how the "locking" works. Also the documentation of index talks about a range, while in reality the index is updated on a succesful search to the index of the found entry plus one. Fix similar issues for mt_find_after() and mt_prev(). Reword the confusing "Note: Will not return the zero entry." comment on mt_for_each() and document @__index correctly. Link: https://lkml.kernel.org/r/87ttw2n556.ffs@tglx Signed-off-by: Thomas Gleixner Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Cc: Shanker Donthineni Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 295548cca8b3..6e5bd2c9875d 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -662,10 +662,11 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry - * @__index: The index to update to track the location in the tree + * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * - * Note: Will not return the zero entry. + * This iterator skips all entries, which resolve to a NULL pointer, + * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ -- cgit v1.2.3 From fc1878ec70ede56ee48f2d65525d4f7c6888b496 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 1 Jul 2023 11:28:53 +0800 Subject: mm: remove page_rmapping() After converting the last user to folio_raw_mapping(), we can safely remove the function. Link: https://lkml.kernel.org/r/20230701032853.258697-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 406ab9ea818f..6d150990e35c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2170,7 +2170,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern void *page_rmapping(struct page *page); extern pgoff_t __page_file_index(struct page *page); /* -- cgit v1.2.3 From 527ed4f7d902d362471a93e1a4afb604c18ceb48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Jun 2023 14:22:52 +0800 Subject: mm: remove arguments of show_mem() All callers of show_mem() pass 0 and NULL, so we can remove the two arguments by directly calling __show_mem(0, NULL, MAX_NR_ZONES - 1) in show_mem(). 
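For illustration, the effect at call sites is mechanical (the caller shown here is illustrative and not part of the diff below): every user that previously wrote

        show_mem(0, NULL);

now simply writes

        show_mem();

with the flags/nodemask/zone-index defaults supplied once, inside the inline wrapper.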
Link: https://lkml.kernel.org/r/20230630062253.189440-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Christophe Leroy Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6d150990e35c..8c3e3eec9008 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3068,9 +3068,9 @@ extern void mem_init(void); extern void __init mmap_init(void); extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); -static inline void show_mem(unsigned int flags, nodemask_t *nodemask) +static inline void show_mem(void) { - __show_mem(flags, nodemask, MAX_NR_ZONES - 1); + __show_mem(0, NULL, MAX_NR_ZONES - 1); } extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); -- cgit v1.2.3 From 1279aa0656bbebbeecbd3bec0dd50faf35e5db38 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Jun 2023 14:22:53 +0800 Subject: mm: make show_free_areas() static All callers of show_free_areas() pass 0 and NULL, so we can directly use show_mem() instead of show_free_areas(0, NULL), which could make show_free_areas() a static function. Link: https://lkml.kernel.org/r/20230630062253.189440-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Christophe Leroy Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8c3e3eec9008..d1ad22980ebe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2236,18 +2236,6 @@ extern void pagefault_out_of_memory(void); #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) -/* - * Flags passed to show_mem() and show_free_areas() to suppress output in - * various contexts. - */ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ - -extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); -static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask) -{ - __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); -} - /* * Parameter block passed down to zap_pte_range in exceptional cases. */ -- cgit v1.2.3 From 5502ea44f5ade35d32a397353956bc026b870400 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:05 -0400 Subject: mm/hugetlb: add page_mask for hugetlb_follow_page_mask() follow_page() doesn't need it, but we'll start to need it when unifying gup for hugetlb. Link: https://lkml.kernel.org/r/20230628215310.73782-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ca3c8e10f24a..9f282f370d96 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -131,7 +131,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags); + unsigned long address, unsigned int flags, + unsigned int *page_mask); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, unsigned long *, unsigned long *, long, unsigned int, int *); @@ -297,8 +298,9 @@ static inline void adjust_range_if_pmd_sharing_possible( { } -static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static inline struct page *hugetlb_follow_page_mask( + struct vm_area_struct *vma, unsigned long address, unsigned int flags, + unsigned int *page_mask) { BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ } -- cgit v1.2.3 From 4849807114b83e1897381ed3f851632f376a0b7e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:08 -0400 Subject: mm/gup: retire follow_hugetlb_page() Now __get_user_pages() should be well prepared to handle thp completely, as long as hugetlb gup requests even without the hugetlb's special path. Time to retire follow_hugetlb_page(). Tweak misc comments to reflect reality of follow_hugetlb_page()'s removal. Link: https://lkml.kernel.org/r/20230628215310.73782-7-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9f282f370d96..9bc3c2d71b71 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask); -long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, - struct page **, unsigned long *, unsigned long *, - long, unsigned int, int *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); @@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask( BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ } -static inline long follow_hugetlb_page(struct mm_struct *mm, - struct vm_area_struct *vma, struct page **pages, - unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags, int *nonblocking) -{ - BUG(); - return 0; -} - static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, -- cgit v1.2.3 From a524fcfe190da16bbf1311b6636f51d81f35d59a Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Mon, 26 Jun 2023 07:55:18 +0200 Subject: fs: convert block_commit_write to return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit block_commit_write() always returns 0, this patch changes it to return void. Link: https://lkml.kernel.org/r/20230626055518.842392-3-beanhuo@iokpp.de Signed-off-by: Bean Huo Reviewed-by: Jan Kara Acked-by: Theodore Ts'o Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Al Viro Cc: Andreas Dilger Cc: Christian Brauner Cc: Joel Becker Cc: Joseph Qi Cc: Luís Henriques Cc: Mark Fasheh Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6cb3e9af78c9..a7377877ff4e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -288,7 +288,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); -int block_commit_write(struct page *page, unsigned from, unsigned to); +void block_commit_write(struct page *page, unsigned int from, unsigned int to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); /* Convert errno to return value from ->page_mkwrite() call */ -- cgit v1.2.3 From 3fade62b62e84dd8dbf6e92d494b0e7eca750c43 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 25 Jun 2023 10:13:23 +0800 Subject: mm/mm_init.c: remove obsolete macro HASH_SMALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HASH_SMALL only works when parameter numentries is 0. But the sole caller futex_init() never calls alloc_large_system_hash() with numentries set to 0. So HASH_SMALL is obsolete and remove it. 
Link: https://lkml.kernel.org/r/20230625021323.849147-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Rapoport (IBM) Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Miaohe Lin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/memblock.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f71ff9f0ec81..0d031fbfea25 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -581,9 +581,7 @@ extern void *alloc_large_system_hash(const char *tablename, unsigned long high_limit); #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ -#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min - * shift passed via *_hash_shift */ -#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ +#define HASH_ZERO 0x00000002 /* Zero allocated hash table */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. -- cgit v1.2.3 From 79271476b3362a9e69adae949a520647f8af3559 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:28 +0800 Subject: ksm: support unsharing KSM-placed zero pages Patch series "ksm: support tracking KSM-placed zero-pages", v10. The core idea of this patch set is to enable users to perceive the number of any pages merged by KSM, regardless of whether use_zero_page switch has been turned on, so that users can know how much free memory increase is really due to their madvise(MERGEABLE) actions. But the problem is, when enabling use_zero_pages, all empty pages will be merged with kernel zero pages instead of with each other as use_zero_pages is disabled, and then these zero-pages are no longer monitored by KSM. The motivations to do this is seen at: https://lore.kernel.org/lkml/202302100915227721315@zte.com.cn/ In one word, we hope to implement the support for KSM-placed zero pages tracking without affecting the feature of use_zero_pages, so that app developer can also benefit from knowing the actual KSM profit by getting KSM-placed zero pages to optimize applications eventually when /sys/kernel/mm/ksm/use_zero_pages is enabled. This patch (of 5): When use_zero_pages of ksm is enabled, madvise(addr, len, MADV_UNMERGEABLE) and other ways (like write 2 to /sys/kernel/mm/ksm/run) to trigger unsharing will *not* actually unshare the shared zeropage as placed by KSM (which is against the MADV_UNMERGEABLE documentation). As these KSM-placed zero pages are out of the control of KSM, the related counts of ksm pages don't expose how many zero pages are placed by KSM (these special zero pages are different from those initially mapped zero pages, because the zero pages mapped to MADV_UNMERGEABLE areas are expected to be a complete and unshared page). To not blindly unshare all shared zero_pages in applicable VMAs, the patch use pte_mkdirty (related with architecture) to mark KSM-placed zero pages. Thus, MADV_UNMERGEABLE will only unshare those KSM-placed zero pages. In addition, we'll reuse this mechanism to reliably identify KSM-placed ZeroPages to properly account for them (e.g., calculating the KSM profit that includes zeropages) in the latter patches. The patch will not degrade the performance of use_zero_pages as it doesn't change the way of merging empty pages in use_zero_pages's feature. 
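A hedged sketch of the mechanism described above (the mm/ksm.c hunks are not part of this include/linux extract, so the merge-side line is illustrative; the test itself is the is_ksm_zero_pte() helper added to ksm.h below):

        /* merge side: when an empty page is replaced by the shared zeropage,
         * additionally mark the pte dirty so the mapping can be attributed
         * to KSM later */
        newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(zero_page),
                                                   vma->vm_page_prot)));

        /* unmerge side: MADV_UNMERGEABLE only breaks out zeropage ptes that
         * carry the dirty bit, i.e. ptes for which is_ksm_zero_pte(pte) is
         * true; ordinary (non-dirty) zeropage mappings are left untouched */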
Link: https://lkml.kernel.org/r/202306131104554703428@zte.com.cn Link: https://lkml.kernel.org/r/20230613030928.185882-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Cc: Claudio Imbrenda Cc: Xuexin Jiang Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: Andrew Morton --- include/linux/ksm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 899a314bc487..98878107244f 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -26,6 +26,12 @@ int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); +/* + * To identify zeropages that were mapped by KSM, we reuse the dirty bit + * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when + * deduplicating memory. + */ +#define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { -- cgit v1.2.3 From e2942062e01df85b4692460fe5b48ab0c90fdb95 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:34 +0800 Subject: ksm: count all zero pages placed by KSM As pages_sharing and pages_shared don't include the number of zero pages merged by KSM, we cannot know how many pages are zero pages placed by KSM when enabling use_zero_pages, which leads to KSM not being transparent with all actual merged pages by KSM. In the early days of use_zero_pages, zero-pages was unable to get unshared by the ways like MADV_UNMERGEABLE so it's hard to count how many times one of those zeropages was then unmerged. But now, unsharing KSM-placed zero page accurately has been achieved, so we can easily count both how many times a page full of zeroes was merged with zero-page and how many times one of those pages was then unmerged. and so, it helps to estimate memory demands when each and every shared page could get unshared. So we add ksm_zero_pages under /sys/kernel/mm/ksm/ to show the number of all zero pages placed by KSM. Meanwhile, we update the Documentation. 
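A short sketch of how the new count is meant to stay coherent (the helper is added in the hunk below; the zap-path caller shown here is illustrative, the real callers live in mm/ and are not part of this include/linux extract):

        /* when tearing down a pte, give KSM a chance to un-account a
         * KSM-placed zeropage */
        pte_t ptent = ptep_get_and_clear(mm, addr, ptep);

        ksm_might_unmap_zero_page(ptent);

        /* userspace then reads the global count from
         * /sys/kernel/mm/ksm/ksm_zero_pages */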
Link: https://lkml.kernel.org/r/20230613030934.185944-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Cc: Claudio Imbrenda Cc: Xuexin Jiang Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: Andrew Morton --- include/linux/ksm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 98878107244f..e80aa49009b2 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -33,6 +33,14 @@ void __ksm_exit(struct mm_struct *mm); */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) +extern unsigned long ksm_zero_pages; + +static inline void ksm_might_unmap_zero_page(pte_t pte) +{ + if (is_ksm_zero_pte(pte)) + ksm_zero_pages--; +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { int ret; @@ -101,6 +109,10 @@ static inline void ksm_exit(struct mm_struct *mm) { } +static inline void ksm_might_unmap_zero_page(pte_t pte) +{ +} + #ifdef CONFIG_MEMORY_FAILURE static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) -- cgit v1.2.3 From 6080d19f07043ade61094d0f58b14c05e1694a39 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:38 +0800 Subject: ksm: add ksm zero pages for each process As the number of ksm zero pages is not included in ksm_merging_pages per process when enabling use_zero_pages, it's unclear of how many actual pages are merged by KSM. To let users accurately estimate their memory demands when unsharing KSM zero-pages, it's necessary to show KSM zero- pages per process. In addition, it help users to know the actual KSM profit because KSM-placed zero pages are also benefit from KSM. since unsharing zero pages placed by KSM accurately is achieved, then tracking empty pages merging and unmerging is not a difficult thing any longer. Since we already have /proc//ksm_stat, just add the information of 'ksm_zero_pages' in it. Link: https://lkml.kernel.org/r/20230613030938.185993-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Cc: Claudio Imbrenda Cc: Xuexin Jiang Signed-off-by: Andrew Morton --- include/linux/ksm.h | 8 +++++--- include/linux/mm_types.h | 9 +++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e80aa49009b2..c2dd786a30e1 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -35,10 +35,12 @@ void __ksm_exit(struct mm_struct *mm); extern unsigned long ksm_zero_pages; -static inline void ksm_might_unmap_zero_page(pte_t pte) +static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { - if (is_ksm_zero_pte(pte)) + if (is_ksm_zero_pte(pte)) { ksm_zero_pages--; + mm->ksm_zero_pages--; + } } static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) @@ -109,7 +111,7 @@ static inline void ksm_exit(struct mm_struct *mm) { } -static inline void ksm_might_unmap_zero_page(pte_t pte) +static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5e74ce4a28cd..51d04c1847c1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -812,7 +812,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM - * merging. + * merging (not including ksm_zero_pages). 
*/ unsigned long ksm_merging_pages; /* @@ -820,7 +820,12 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; -#endif + /* + * Represent how many empty pages are merged with kernel zero + * pages when enabling KSM use_zero_pages. + */ + unsigned long ksm_zero_pages; +#endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN struct { /* this mm_struct is on lru_gen_mm_list */ -- cgit v1.2.3 From bded67f81ec47e6054ad24c1c7992a6523a9b2c6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Jul 2023 14:39:05 +0800 Subject: memory tier: rename destroy_memory_type() to put_memory_type() It appears that destroy_memory_type() isn't a very good name because we usually will not free the memory_type here. So rename it to a more appropriate name i.e. put_memory_type(). Link: https://lkml.kernel.org/r/20230706063905.543800-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Suggested-by: Huang, Ying Reviewed-by: "Huang, Ying" Reviewed-by: Xiao Yang Cc: Dan Williams Cc: Dave Jiang Cc: Vishal Verma Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index fc9647b1b4f9..437441cdf78f 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -33,7 +33,7 @@ struct memory_dev_type { #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; struct memory_dev_type *alloc_memory_type(int adistance); -void destroy_memory_type(struct memory_dev_type *memtype); +void put_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype); #ifdef CONFIG_MIGRATION @@ -68,7 +68,7 @@ static inline struct memory_dev_type *alloc_memory_type(int adistance) return NULL; } -static inline void destroy_memory_type(struct memory_dev_type *memtype) +static inline void put_memory_type(struct memory_dev_type *memtype) { } -- cgit v1.2.3 From 8f21912a4bf854e51b3ba69298f559f976d63685 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Jul 2023 17:24:41 +0800 Subject: mm: remove obsolete comment above struct per_cpu_pages Since commit 01b44456a7aa ("mm/page_alloc: replace local_lock with normal spinlock"), per_cpu_pages is protected by normal spinlock. Remove the obsolete comment as it's not that helpful. 
Link: https://lkml.kernel.org/r/20230706092441.1574950-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5e50b78d58ea..4106fbc5b4b3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -676,7 +676,6 @@ enum zone_watermarks { #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) -/* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ -- cgit v1.2.3 From b4fa966f03b7401ceacd4ffd7227197afb2b8376 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Jun 2023 11:48:52 +0100 Subject: mm, netfs, fscache: stop read optimisation when folio removed from pagecache Fscache has an optimisation by which reads from the cache are skipped until we know that (a) there's data there to be read and (b) that data isn't entirely covered by pages resident in the netfs pagecache. This is done with two flags manipulated by fscache_note_page_release(): if (... test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); where the NO_DATA_TO_READ flag causes cachefiles_prepare_read() to indicate that netfslib should download from the server or clear the page instead. The fscache_note_page_release() function is intended to be called from ->releasepage() - but that only gets called if PG_private or PG_private_2 is set - and currently the former is at the discretion of the network filesystem and the latter is only set whilst a page is being written to the cache, so sometimes we miss clearing the optimisation. Fix this by following Willy's suggestion[1] and adding an address_space flag, AS_RELEASE_ALWAYS, that causes filemap_release_folio() to always call ->release_folio() if it's set, even if PG_private or PG_private_2 aren't set. Note that this would require folio_test_private() and page_has_private() to become more complicated. To avoid that, in the places[*] where these are used to conditionalise calls to filemap_release_folio() and try_to_release_page(), the tests are removed the those functions just jumped to unconditionally and the test is performed there. [*] There are some exceptions in vmscan.c where the check guards more than just a call to the releaser. I've added a function, folio_needs_release() to wrap all the checks for that. AS_RELEASE_ALWAYS should be set if a non-NULL cookie is obtained from fscache and cleared in ->evict_inode() before truncate_inode_pages_final() is called. Additionally, the FSCACHE_COOKIE_NO_DATA_TO_READ flag needs to be cleared and the optimisation cancelled if a cachefiles object already contains data when we open it. 
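A minimal sketch of the intended usage on the filesystem side, following the rule stated above (hook placement is illustrative; only the mapping_*release_always() helpers come from the hunk below):

        /* when a non-NULL fscache cookie is attached to the inode */
        if (cookie)
                mapping_set_release_always(inode->i_mapping);

        /* in ->evict_inode(), before tearing the pagecache down */
        mapping_clear_release_always(inode->i_mapping);
        truncate_inode_pages_final(&inode->i_data);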
[dwysocha@redhat.com: call folio_mapping() inside folio_needs_release()] Link: https://github.com/DaveWysochanskiRH/kernel/commit/902c990e311120179fa5de99d68364b2947b79ec Link: https://lkml.kernel.org/r/20230628104852.3391651-3-dhowells@redhat.com Fixes: 1f67e6d0b188 ("fscache: Provide a function to note the release of a page") Fixes: 047487c947e8 ("cachefiles: Implement the I/O routines") Signed-off-by: David Howells Signed-off-by: Dave Wysochanski Reported-by: Rohith Surabattula Suggested-by: Matthew Wilcox Tested-by: SeongJae Park Cc: Daire Byrne Cc: Matthew Wilcox Cc: Linus Torvalds Cc: Steve French Cc: Shyam Prasad N Cc: Rohith Surabattula Cc: Dave Wysochanski Cc: Dominique Martinet Cc: Ilya Dryomov Cc: Andreas Dilger Cc: Jingbo Xu Cc: "Theodore Ts'o" Cc: Xiubo Li Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 716953ee1ebd..0ab0f2362b9b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -203,6 +203,7 @@ enum mapping_flags { /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_LARGE_FOLIO_SUPPORT = 6, + AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ }; /** @@ -273,6 +274,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping) return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } +static inline bool mapping_release_always(const struct address_space *mapping) +{ + return test_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + +static inline void mapping_set_release_always(struct address_space *mapping) +{ + set_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + +static inline void mapping_clear_release_always(struct address_space *mapping) +{ + clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; -- cgit v1.2.3 From 60b1e24ce8c3334d9204d6229356b750632136be Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 8 Jul 2023 10:33:04 +0800 Subject: mm/memcg: minor cleanup for MEM_CGROUP_ID_MAX MEM_CGROUP_ID_MAX is only used when CONFIG_MEMCG is configured. So remove unneeded !CONFIG_MEMCG variant. Also it's only used in mem_cgroup_alloc(), so move it from memcontrol.h to memcontrol.c. And further define it as: #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) so if someone changes MEM_CGROUP_ID_SHIFT in the future, then MEM_CGROUP_ID_MAX will be updated accordingly, as suggested by Muchun. 
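For reference, the identity the new definition relies on, written out (not part of the patch):

        #define MEM_CGROUP_ID_SHIFT     16
        #define MEM_CGROUP_ID_MAX       ((1UL << MEM_CGROUP_ID_SHIFT) - 1)   /* 65535 == USHRT_MAX */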
Link: https://lkml.kernel.org/r/20230708023304.1184111-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5818af8eca5a..58eb7ca65699 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -61,7 +61,6 @@ struct mem_cgroup_reclaim_cookie { #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 -#define MEM_CGROUP_ID_MAX USHRT_MAX struct mem_cgroup_id { int id; @@ -1158,7 +1157,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 -#define MEM_CGROUP_ID_MAX 0 static inline struct mem_cgroup *folio_memcg(struct folio *folio) { -- cgit v1.2.3 From af19487f00f34ff8643921d7909dbb3fedc7e329 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:33 -0700 Subject: mm: make PTE_MARKER_SWAPIN_ERROR more general Patch series "add UFFDIO_POISON to simulate memory poisoning with UFFD", v4. This series adds a new userfaultfd feature, UFFDIO_POISON. See commit 4 for a detailed description of the feature. This patch (of 8): Future patches will reuse PTE_MARKER_SWAPIN_ERROR to implement UFFDIO_POISON, so make some various preparations for that: First, rename it to just PTE_MARKER_POISONED. The "SWAPIN" can be confusing since we're going to re-use it for something not really related to swap. This can be particularly confusing for things like hugetlbfs, which doesn't support swap whatsoever. Also rename some various helper functions. Next, fix pte marker copying for hugetlbfs. Previously, it would WARN on seeing a PTE_MARKER_SWAPIN_ERROR, since hugetlbfs doesn't support swap. But, since we're going to re-use it, we want it to go ahead and copy it just like non-hugetlbfs memory does today. Since the code to do this is more complicated now, pull it out into a helper which can be re-used in both places. While we're at it, also make it slightly more explicit in its handling of e.g. uffd wp markers. For non-hugetlbfs page faults, instead of returning VM_FAULT_SIGBUS for an error entry, return VM_FAULT_HWPOISON. For most cases this change doesn't matter, e.g. a userspace program would receive a SIGBUS either way. But for UFFDIO_POISON, this change will let KVM guests get an MCE out of the box, instead of giving a SIGBUS to the hypervisor and requiring it to somehow inject an MCE. Finally, for hugetlbfs faults, handle PTE_MARKER_POISONED, and return VM_FAULT_HWPOISON_LARGE in such cases. Note that this can't happen today because the lack of swap support means we'll never end up with such a PTE anyway, but this behavior will be needed once such entries *can* show up via UFFDIO_POISON. Link: https://lkml.kernel.org/r/20230707215540.2324998-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20230707215540.2324998-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. 
Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 19 +++++++++++++++++++ include/linux/swapops.h | 15 ++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 21d6c72bcc71..a86c84600787 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -523,6 +523,25 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 1; } +/* + * Computes the pte marker to copy from the given source entry into dst_vma. + * If no marker should be copied, returns 0. + * The caller should insert a new pte created with make_pte_marker(). + */ +static inline pte_marker copy_pte_marker( + swp_entry_t entry, struct vm_area_struct *dst_vma) +{ + pte_marker srcm = pte_marker_get(entry); + /* Always copy error entries. */ + pte_marker dstm = srcm & PTE_MARKER_POISONED; + + /* Only copy PTE markers if UFFD register matches. */ + if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) + dstm |= PTE_MARKER_UFFD_WP; + + return dstm; +} + /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 4c932cb45e0b..bff1e8d97de0 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -393,7 +393,12 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) -#define PTE_MARKER_SWAPIN_ERROR BIT(1) +/* + * "Poisoned" here is meant in the very general sense of "future accesses are + * invalid", instead of referring very specifically to hardware memory errors. + * This marker is meant to represent any of various different causes of this. 
+ */ +#define PTE_MARKER_POISONED BIT(1) #define PTE_MARKER_MASK (BIT(2) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) @@ -421,15 +426,15 @@ static inline pte_t make_pte_marker(pte_marker marker) return swp_entry_to_pte(make_pte_marker_entry(marker)); } -static inline swp_entry_t make_swapin_error_entry(void) +static inline swp_entry_t make_poisoned_swp_entry(void) { - return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR); + return make_pte_marker_entry(PTE_MARKER_POISONED); } -static inline int is_swapin_error_entry(swp_entry_t entry) +static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR); + (pte_marker_get(entry) & PTE_MARKER_POISONED); } /* -- cgit v1.2.3 From f92cedfa39ef208c9685d2fdd8215bb58178571b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 8 Jul 2023 18:03:23 -0700 Subject: mm-make-pte_marker_swapin_error-more-general-fix fix CONFIG_MMU=n build Cc: Axel Rasmussen Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index a86c84600787..8148b30a9df1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -523,6 +523,7 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 1; } +#ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. @@ -541,6 +542,7 @@ static inline pte_marker copy_pte_marker( return dstm; } +#endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to -- cgit v1.2.3 From fc71884a5f599a603fcc3c2b28b3872c09d19c18 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:36 -0700 Subject: mm: userfaultfd: add new UFFDIO_POISON ioctl The basic idea here is to "simulate" memory poisoning for VMs. A VM running on some host might encounter a memory error, after which some page(s) are poisoned (i.e., future accesses SIGBUS). They expect that once poisoned, pages can never become "un-poisoned". So, when we live migrate the VM, we need to preserve the poisoned status of these pages. When live migrating, we try to get the guest running on its new host as quickly as possible. So, we start it running before all memory has been copied, and before we're certain which pages should be poisoned or not. So the basic way to use this new feature is: - On the new host, the guest's memory is registered with userfaultfd, in either MISSING or MINOR mode (doesn't really matter for this purpose). - On any first access, we get a userfaultfd event. At this point we can communicate with the old host to find out if the page was poisoned. - If so, we can respond with a UFFDIO_POISON - this places a swap marker so any future accesses will SIGBUS. Because the pte is now "present", future accesses won't generate more userfaultfd events, they'll just SIGBUS directly. UFFDIO_POISON does not handle unmapping previously-present PTEs. This isn't needed, because during live migration we want to intercept all accesses with userfaultfd (not just writes, so WP mode isn't useful for this). So whether minor or missing mode is being used (or both), the PTE won't be present in any case, so handling that case isn't needed. Similarly, UFFDIO_POISON won't replace existing PTE markers. 
This might be okay to do, but it seems to be safer to just refuse to overwrite any existing entry (like a UFFD_WP PTE marker). Link: https://lkml.kernel.org/r/20230707215540.2324998-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac7b0c96d351..ac8c6854097c 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -46,6 +46,7 @@ enum mfill_atomic_mode { MFILL_ATOMIC_COPY, MFILL_ATOMIC_ZEROPAGE, MFILL_ATOMIC_CONTINUE, + MFILL_ATOMIC_POISON, NR_MFILL_ATOMIC_MODES, }; @@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, atomic_t *mmap_changing, uffd_flags_t flags); +extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); -- cgit v1.2.3 From d695c30a8ca07ac7e2138435b461b36289d5656e Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:38 +0800 Subject: maple_tree: don't use MAPLE_ARANGE64_META_MAX to indicate no gap Patch series "Improve the validation for maple tree and some cleanup", v2. This patch (of 7): Do not use a special offset to indicate that there is no gap. When there is no gap, offset can point to any valid slots because its gap is 0. Link: https://lkml.kernel.org/r/20230711035444.526-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230711035444.526-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 6e5bd2c9875d..7769270b85e8 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -29,14 +29,12 @@ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ -#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ -#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ -- cgit v1.2.3 From a349d72fd9efc87c8fd1d16d3164752d84a7275b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:30:40 -0700 Subject: mm/pgtable: add rcu_read_lock() and rcu_read_unlock()s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: free retracted page table by RCU", v3. Some mmap_lock avoidance i.e. latency reduction. Initially just for the case of collapsing shmem or file pages to THPs: the usefulness of MADV_COLLAPSE on shmem is being limited by that mmap_write_lock it currently requires. Likely to be relied upon later in other contexts e.g. freeing of empty page tables (but that's not work I'm doing). mmap_write_lock avoidance when collapsing to anon THPs? Perhaps, but again that's not work I've done: a quick attempt was not as easy as the shmem/file case. These changes (though of course not these exact patches) have been in Google's data centre kernel for three years now: we do rely upon them. This patch (of 13): Before putting them to use (several commits later), add rcu_read_lock() to pte_offset_map(), and rcu_read_unlock() to pte_unmap(). Make this a separate commit, since it risks exposing imbalances: prior commits have fixed all the known imbalances, but we may find some have been missed. Link: https://lkml.kernel.org/r/7cd843a9-aa80-14f-5eb2-33427363c20@google.com Link: https://lkml.kernel.org/r/d3b01da5-2a6-833c-6681-67a3e024a16f@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5063b482e34f..5134edcec668 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -99,7 +99,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ - /* rcu_read_unlock() to be added later */ \ + rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) @@ -108,7 +108,7 @@ static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) } static inline void pte_unmap(pte_t *pte) { - /* rcu_read_unlock() to be added later */ + rcu_read_unlock(); } #endif -- cgit v1.2.3 From 146b42e07494e45f7c7bcf2cbf7afd1424afd78e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:32:05 -0700 Subject: mm/pgtable: add PAE safety to __pte_offset_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a faint risk that __pte_offset_map(), on a 32-bit architecture with a 64-bit pmd_t e.g. x86-32 with CONFIG_X86_PAE=y, would succeed on a pmdval assembled from a pmd_low and a pmd_high which never belonged together: their combination not pointing to a page table at all, perhaps not even a valid pfn. pmdp_get_lockless() is not enough to prevent that. Guard against that (on such configs) by local_irq_save() blocking TLB flush between present updates, as linux/pgtable.h suggests. It's only needed around the pmdp_get_lockless() in __pte_offset_map(): a race when __pte_offset_map_lock() repeats the pmdp_get_lockless() after getting the lock, would just send it back to __pte_offset_map() again. Complement this pmdp_get_lockless_start() and pmdp_get_lockless_end(), used only locally in __pte_offset_map(), with a pmdp_get_lockless_sync() synonym for tlb_remove_table_sync_one(): to send the necessary interrupt at the right moment on those configs which do not already send it. CONFIG_GUP_GET_PXX_LOW_HIGH is enabled when required by mips, sh and x86. It is not enabled by arm-32 CONFIG_ARM_LPAE: my understanding is that Will Deacon's 2020 enhancements to READ_ONCE() are sufficient for arm. It is not enabled by arc, but its pmd_t is 32-bit even when pte_t 64-bit. Limit the IRQ disablement to CONFIG_HIGHPTE? Perhaps, but would need a little more work, to retry if pmd_low good for page table, but pmd_high non-zero from THP (and that might be making x86-specific assumptions). Link: https://lkml.kernel.org/r/3adcd8f-9191-2df1-d7ea-c4877698aad@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5134edcec668..7f2db400f653 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -390,6 +390,7 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) return pmd; } #define pmdp_get_lockless pmdp_get_lockless +#define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ @@ -408,6 +409,9 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } +static inline void pmdp_get_lockless_sync(void) +{ +} #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From 13cf577e6b66a148d6d63f5ef7801f4b61d5850f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:39:48 -0700 Subject: mm/pgtable: add pte_free_defer() for pgtable as page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the generic pte_free_defer(), to call pte_free() via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This version suits all those architectures which use an unfragmented page for one page table (none of whose pte_free()s use the mm arg which was passed to it). Link: https://lkml.kernel.org/r/78e921b0-b681-a1b0-dc20-44c9efa4ef3c@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++++ include/linux/pgtable.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 51d04c1847c1..1fc4b9c2c8a6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -144,6 +144,10 @@ struct page { struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ + /* + * A PTE page table page might be freed by use of + * rcu_head: which overlays those two fields above. + */ unsigned long _pt_pad_2; /* mapping */ union { struct mm_struct *pt_mm; /* x86 pgds only */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 7f2db400f653..9fa34be65159 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -112,6 +112,8 @@ static inline void pte_unmap(pte_t *pte) } #endif +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) -- cgit v1.2.3 From cf95e337cb63cfbf5c9ea1a1f64f9818b979e3b3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:48:48 -0700 Subject: mm: delete mmap_write_trylock() and vma_try_start_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mmap_write_trylock() and vma_try_start_write() were added just for khugepaged, but now it has no use for them: delete. Link: https://lkml.kernel.org/r/4e6db3d-e8e-73fb-1f2a-8de2dab2a87c@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 ----------------- include/linux/mmap_lock.h | 10 ---------- 2 files changed, 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d1ad22980ebe..bfb46483108c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -709,21 +709,6 @@ static inline void vma_start_write(struct vm_area_struct *vma) up_write(&vma->vm_lock->lock); } -static inline bool vma_try_start_write(struct vm_area_struct *vma) -{ - int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) - return true; - - if (!down_write_trylock(&vma->vm_lock->lock)) - return false; - - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); - return true; -} - static inline void vma_assert_write_locked(struct vm_area_struct *vma) { int mm_lock_seq; @@ -748,8 +733,6 @@ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} -static inline bool vma_try_start_write(struct vm_area_struct *vma) - { return true; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index e05e167dbd16..a5c63b6d7d46 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -118,16 +118,6 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) return ret; } -static inline bool mmap_write_trylock(struct mm_struct *mm) -{ - bool ret; - - __mmap_lock_trace_start_locking(mm, true); - ret = down_write_trylock(&mm->mmap_lock) != 0; - __mmap_lock_trace_acquire_returned(mm, true, ret); - return ret; -} - static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); -- cgit v1.2.3 From 73e791d73877e90444afb6036d86ea37fd78584f Mon Sep 17 00:00:00 2001 From: Xueshi Hu Date: Wed, 12 Jul 2023 21:49:59 +0800 Subject: mm: remove clear_page_idle() All callers have now been converted to call folio_clear_idle(). 
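The conversion that made this removal possible is mechanical (illustrative caller):

        /* before */
        clear_page_idle(page);

        /* after: use the folio interface directly */
        folio_clear_idle(page_folio(page));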
Link: https://lkml.kernel.org/r/20230712134959.145373-1-xueshi.hu@smartx.com Signed-off-by: Xueshi Hu Reviewed-by: David Hildenbrand Cc: Charan Teja Kalla Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/page_idle.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 5cb7bd2078ec..d8f344840643 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -144,9 +144,4 @@ static inline void set_page_idle(struct page *page) { folio_set_idle(page_folio(page)); } - -static inline void clear_page_idle(struct page *page) -{ - folio_clear_idle(page_folio(page)); -} #endif /* _LINUX_MM_PAGE_IDLE_H */ -- cgit v1.2.3 From b79f8eb408d0468df0d6082ed958b67d94adce65 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Thu, 13 Jul 2023 00:18:31 +0000 Subject: mm/hwpoison: check if a raw page in a hugetlb folio is raw HWPOISON Add the functionality, is_raw_hwpoison_page_in_hugepage, to tell if a raw page in a hugetlb folio is HWPOISON. This functionality relies on RawHwpUnreliable to be not set; otherwise hugepage's raw HWPOISON list becomes meaningless. is_raw_hwpoison_page_in_hugepage holds mf_mutex in order to synchronize with folio_set_hugetlb_hwpoison and folio_free_raw_hwp who iterate, insert, or delete entry in raw_hwp_list. llist itself doesn't ensure insertion and removal are synchornized with the llist_for_each_entry used by is_raw_hwpoison_page_in_hugepage (unless iterated entries are already deleted from the list). Caller can minimize the overhead of lock cycles by first checking HWPOISON flag of the folio. Exports this functionality to be immediately used in the read operation for hugetlbfs. Link: https://lkml.kernel.org/r/20230713001833.3778937-3-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: James Houghton Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9bc3c2d71b71..9f4bac3df59e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -997,6 +997,11 @@ void hugetlb_register_node(struct node *node); void hugetlb_unregister_node(struct node *node); #endif +/* + * Check if a given raw @page in a hugepage is HWPOISON. + */ +bool is_raw_hwpoison_page_in_hugepage(struct page *page); + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; -- cgit v1.2.3 From aa232204c4689427cefa55fe975692b57291523a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:31 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pte_clear Remove unused addr in page_table_check_pte_clear and __page_table_check_pte_clear. 
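The caller-side change is equally mechanical; the generic ptep_get_and_clear() hunk below is the in-tree example, and other callers follow the same pattern (sketch):

        /* before */
        page_table_check_pte_clear(mm, address, pte);

        /* after: the address was never used, so it is simply dropped */
        page_table_check_pte_clear(mm, pte);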
Link: https://lkml.kernel.org/r/20230713172636.1705415-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 01e16c7696ec..35c53c4b94d3 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -14,8 +14,7 @@ extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); -void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t pte); +void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, @@ -46,13 +45,12 @@ static inline void page_table_check_free(struct page *page, unsigned int order) __page_table_check_zero(page, order); } -static inline void page_table_check_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_clear(mm, addr, pte); + __page_table_check_pte_clear(mm, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, @@ -123,8 +121,7 @@ static inline void page_table_check_free(struct page *page, unsigned int order) { } -static inline void page_table_check_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9fa34be65159..a1ccb13c4853 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -322,7 +322,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } #endif -- cgit v1.2.3 From 1831414cd729a34af937d56ad684a66599de6344 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:32 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear Remove unused addr in page_table_check_pmd_clear and __page_table_check_pmd_clear. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 35c53c4b94d3..0f777bca5283 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -15,8 +15,7 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); -void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, - pmd_t pmd); +void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, @@ -53,13 +52,12 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) __page_table_check_pte_clear(mm, pte); } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, - unsigned long addr, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_clear(mm, addr, pmd); + __page_table_check_pmd_clear(mm, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, @@ -125,8 +123,7 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, - unsigned long addr, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a1ccb13c4853..3edef5ed008f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -425,7 +425,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t pmd = *pmdp; pmd_clear(pmdp); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } -- cgit v1.2.3 From 931c38e16499a057e30a3033f4d6a9c242f0f156 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:33 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pud_clear Remove unused addr in __page_table_check_pud_clear and page_table_check_pud_clear. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 0f777bca5283..5c9dc848a1bc 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -16,8 +16,7 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); -void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, - pud_t pud); +void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, @@ -60,13 +59,12 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) __page_table_check_pmd_clear(mm, pmd); } -static inline void page_table_check_pud_clear(struct mm_struct *mm, - unsigned long addr, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_clear(mm, addr, pud); + __page_table_check_pud_clear(mm, pud); } static inline void page_table_check_pte_set(struct mm_struct *mm, @@ -127,8 +125,7 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } -static inline void page_table_check_pud_clear(struct mm_struct *mm, - unsigned long addr, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 3edef5ed008f..5f36c055794b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -438,7 +438,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); - page_table_check_pud_clear(mm, address, pud); + page_table_check_pud_clear(mm, pud); return pud; } -- cgit v1.2.3 From 1066293d426d3000793c3c3b4276ef38b63ada4a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:34 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pte_set Remove unused addr in __page_table_check_pte_set and page_table_check_pte_set. 
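On the set side the callers are the architecture set_pte_at() implementations; a hedged sketch of what such a call site looks like after this change (hypothetical arch helper shown for illustration, not part of this patch):

	static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
				      pte_t *ptep, pte_t pte)
	{
		/* the check no longer needs addr */
		page_table_check_pte_set(mm, ptep, pte);
		set_pte(ptep, pte);
	}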
Link: https://lkml.kernel.org/r/20230713172636.1705415-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- include/linux/page_table_check.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 5c9dc848a1bc..63ebd9fcf28b 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,8 +17,7 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); +void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, @@ -67,14 +66,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) __page_table_check_pud_clear(mm, pud); } -static inline void page_table_check_pte_set(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, +static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_set(mm, addr, ptep, pte); + __page_table_check_pte_set(mm, ptep, pte); } static inline void page_table_check_pmd_set(struct mm_struct *mm, @@ -129,8 +127,7 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } -static inline void page_table_check_pte_set(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, +static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { } -- cgit v1.2.3 From a3b837130b5865521fa8662aceaa6ebc8d29389a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:35 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set Remove unused addr in __page_table_check_pmd_set and page_table_check_pmd_set. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- include/linux/page_table_check.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 63ebd9fcf28b..dd58dfb0e643 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -18,8 +18,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); -void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -75,14 +74,13 @@ static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, __page_table_check_pte_set(mm, ptep, pte); } -static inline void page_table_check_pmd_set(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp, +static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_set(mm, addr, pmdp, pmd); + __page_table_check_pmd_set(mm, pmdp, pmd); } static inline void page_table_check_pud_set(struct mm_struct *mm, @@ -132,8 +130,7 @@ static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, { } -static inline void page_table_check_pmd_set(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp, +static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { } -- cgit v1.2.3 From 6d144436d954311f2dbacb5bf7b084042448d83e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:36 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pud_set Remove unused addr in __page_table_check_pud_set and page_table_check_pud_set. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- include/linux/page_table_check.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index dd58dfb0e643..7f6b9bf926c5 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -19,8 +19,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); -void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud); +void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); @@ -83,14 +82,13 @@ static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, __page_table_check_pmd_set(mm, pmdp, pmd); } -static inline void page_table_check_pud_set(struct mm_struct *mm, - unsigned long addr, pud_t *pudp, +static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_set(mm, addr, pudp, pud); + __page_table_check_pud_set(mm, pudp, pud); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, @@ -135,8 +133,7 @@ static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, { } -static inline void page_table_check_pud_set(struct mm_struct *mm, - unsigned long addr, pud_t *pudp, +static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { } -- cgit v1.2.3 From b23d03ef7af567929bcd3fca7eea6f4f347387d3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:06 +0100 Subject: highmem: add memcpy_to_folio() and memcpy_from_folio() Patch series "More filesystem folio conversions for 6.6". Remove the only spots in affs which actually use a struct page; there are a few places where one is mentioned, but it's part of the interface. The rest of this is removing the remaining calls to set_bh_page(), and then removing the function before any new users show up. This patch (of 7): These are the folio equivalent of memcpy_to_page() and memcpy_from_page(). 
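A hedged usage sketch (hypothetical caller; the buffer, offset and length are made up, only the two helpers come from this patch):

	char buf[64];

	/* Both helpers accept highmem folios; they map and copy one page
	 * at a time, so the range may straddle a page boundary. */
	memcpy_from_folio(buf, folio, offset, sizeof(buf));
	/* ... modify buf ... */
	memcpy_to_folio(folio, offset, buf, sizeof(buf));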
[agruenba@redhat.com: use correct chunk size in memcpy()] Link: https://lkml.kernel.org/r/20230802144354.1023099-1-agruenba@redhat.com Link: https://lkml.kernel.org/r/20230713035512.4139457-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230713035512.4139457-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Sterba Cc: Jan Kara Cc: Konstantin Komarov Cc: Pankaj Raghav Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Christian Brauner Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton --- include/linux/highmem.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 68da30625a6c..99c474de800d 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -439,6 +439,50 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +static inline void memcpy_from_folio(char *to, struct folio *folio, + size_t offset, size_t len) +{ + VM_BUG_ON(offset + len > folio_size(folio)); + + do { + const char *from = kmap_local_folio(folio, offset); + size_t chunk = len; + + if (folio_test_highmem(folio) && + chunk > PAGE_SIZE - offset_in_page(offset)) + chunk = PAGE_SIZE - offset_in_page(offset); + memcpy(to, from, chunk); + kunmap_local(from); + + from += chunk; + offset += chunk; + len -= chunk; + } while (len > 0); +} + +static inline void memcpy_to_folio(struct folio *folio, size_t offset, + const char *from, size_t len) +{ + VM_BUG_ON(offset + len > folio_size(folio)); + + do { + char *to = kmap_local_folio(folio, offset); + size_t chunk = len; + + if (folio_test_highmem(folio) && + chunk > PAGE_SIZE - offset_in_page(offset)) + chunk = PAGE_SIZE - offset_in_page(offset); + memcpy(to, from, chunk); + kunmap_local(to); + + from += chunk; + offset += chunk; + len -= chunk; + } while (len > 0); + + flush_dcache_folio(folio); +} + /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. -- cgit v1.2.3 From 5f6d28622ffc7fa356b2745b088c831ebb8546b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:12 +0100 Subject: buffer: remove set_bh_page() With all users converted to folio_set_bh(), remove this function. 
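With the function gone, a hedged sketch of what the already-converted call sites look like (illustrative, not a hunk from this patch):

	/* before: attach the buffer_head to a page */
	set_bh_page(bh, page, offset);

	/* after: attach it to the folio instead */
	folio_set_bh(bh, page_folio(page), offset);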
Link: https://lkml.kernel.org/r/20230713035512.4139457-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: David Sterba Cc: Konstantin Komarov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pankaj Raghav Cc: "Theodore Ts'o" Cc: Tom Rix Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index a7377877ff4e..06566aee94ca 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,8 +194,6 @@ void buffer_check_dirty_writeback(struct folio *folio, void mark_buffer_dirty(struct buffer_head *bh); void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); -void set_bh_page(struct buffer_head *bh, - struct page *page, unsigned long offset); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); bool try_to_free_buffers(struct folio *); -- cgit v1.2.3 From 016fec91013cfb27c9f5a101a87ae8266537fed1 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:17 +0800 Subject: mm: move is_ioremap_addr() into new header file Now is_ioremap_addr() is only used in kernel/iomem.c and gonna be used in mm/ioremap.c. Move it into its own new header file linux/ioremap.h. Link: https://lkml.kernel.org/r/20230706154520.11257-17-bhe@redhat.com Suggested-by: Christoph Hellwig Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/ioremap.h | 30 ++++++++++++++++++++++++++++++ include/linux/mm.h | 5 ----- 2 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 include/linux/ioremap.h (limited to 'include/linux') diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h new file mode 100644 index 000000000000..f0e99fc7dd8b --- /dev/null +++ b/include/linux/ioremap.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IOREMAP_H +#define _LINUX_IOREMAP_H + +#include +#include + +#if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP) +/* + * Ioremap often, but not always uses the generic vmalloc area. E.g on + * Power ARCH, it could have different ioremap space. 
+ */ +#ifndef IOREMAP_START +#define IOREMAP_START VMALLOC_START +#define IOREMAP_END VMALLOC_END +#endif +static inline bool is_ioremap_addr(const void *x) +{ + unsigned long addr = (unsigned long)kasan_reset_tag(x); + + return addr >= IOREMAP_START && addr < IOREMAP_END; +} +#else +static inline bool is_ioremap_addr(const void *x) +{ + return false; +} +#endif + +#endif /* _LINUX_IOREMAP_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index bfb46483108c..0ae5654f665b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1055,11 +1055,6 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ - -#ifndef is_ioremap_addr -#define is_ioremap_addr(x) is_vmalloc_addr(x) -#endif - #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); -- cgit v1.2.3 From f73419bb89d606de9be2043febf0957d56627a5b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Jul 2023 21:10:02 +0800 Subject: mm/tlbbatch: rename and extend some functions This patch does some preparation works to extend batched TLB flush to arm64. Including: - Extend set_tlb_ubc_flush_pending() and arch_tlbbatch_add_mm() to accept an additional argument for address, architectures like arm64 may need this for tlbi. - Rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() to match its current function since we don't need to handle mm on architectures like arm64 and add_mm is not proper, add_pending will make sense to both as on x86 we're pending the TLB flush operations while on arm64 we're pending the synchronize operations. This intends no functional changes on x86. Link: https://lkml.kernel.org/r/20230717131004.12662-3-yangyicong@huawei.com Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Signed-off-by: Andrew Morton --- include/linux/mm_types_task.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 5414b5c6a103..aa44fff8bb9d 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -52,8 +52,8 @@ struct tlbflush_unmap_batch { #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * The arch code makes the following promise: generic code can modify a - * PTE, then call arch_tlbbatch_add_mm() (which internally provides all - * needed barriers), then call arch_tlbbatch_flush(), and the entries + * PTE, then call arch_tlbbatch_add_pending() (which internally provides + * all needed barriers), then call arch_tlbbatch_flush(), and the entries * will be flushed on all CPUs by the time that arch_tlbbatch_flush() * returns. */ -- cgit v1.2.3 From aee79d4e5271cee4ffa89ed830189929a6272eb8 Mon Sep 17 00:00:00 2001 From: "Zhu, Lipeng" Date: Sun, 16 Jul 2023 22:56:54 +0800 Subject: fs/address_space: add alignment padding for i_map and i_mmap_rwsem to mitigate a false sharing. When running UnixBench/Shell Scripts, we observed high false sharing for accessing i_mmap against i_mmap_rwsem. 
UnixBench/Shell Scripts are typical load/execute command test scenarios, which concurrently launch->execute->exit a lot of shell commands. A lot of processes invoke vma_interval_tree_remove which touch "i_mmap", the call stack: ----vma_interval_tree_remove |----unlink_file_vma | free_pgtables | |----exit_mmap | | mmput | | |----begin_new_exec | | | load_elf_binary | | | bprm_execve Meanwhile, there are a lot of processes touch 'i_mmap_rwsem' to acquire the semaphore in order to access 'i_mmap'. In existing 'address_space' layout, 'i_mmap' and 'i_mmap_rwsem' are in the same cacheline. The patch places the i_mmap and i_mmap_rwsem in separate cache lines to avoid this false sharing problem. With this patch, based on kernel v6.4.0, on Intel Sapphire Rapids 112C/224T platform, the score improves by ~5.3%. And perf c2c tool shows the false sharing is resolved as expected, the symbol vma_interval_tree_remove disappeared in cache line 0 after this change. Baseline: ================================================= Shared Cache Line Distribution Pareto ================================================= ------------------------------------------------------------- 0 3729 5791 0 0 0xff19b3818445c740 ------------------------------------------------------------- 3.27% 3.02% 0.00% 0.00% 0x18 0 1 0xffffffffa194403b 604 483 389 692 203 [k] vma_interval_tree_insert [kernel.kallsyms] vma_interval_tree_insert+75 0 1 4.13% 3.63% 0.00% 0.00% 0x20 0 1 0xffffffffa19440a2 553 413 415 962 215 [k] vma_interval_tree_remove [kernel.kallsyms] vma_interval_tree_remove+18 0 1 2.04% 1.35% 0.00% 0.00% 0x28 0 1 0xffffffffa219a1d6 1210 855 460 1229 222 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+678 0 1 0.62% 1.85% 0.00% 0.00% 0x28 0 1 0xffffffffa219a1bf 762 329 577 527 198 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+655 0 1 0.48% 0.31% 0.00% 0.00% 0x28 0 1 0xffffffffa219a58c 1677 1476 733 1544 224 [k] down_write [kernel.kallsyms] down_write+28 0 1 0.05% 0.07% 0.00% 0.00% 0x28 0 1 0xffffffffa219a21d 1040 819 689 33 27 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+749 0 1 0.00% 0.05% 0.00% 0.00% 0x28 0 1 0xffffffffa17707db 0 1005 786 1373 223 [k] up_write [kernel.kallsyms] up_write+27 0 1 0.00% 0.02% 0.00% 0.00% 0x28 0 1 0xffffffffa219a064 0 233 778 32 30 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+308 0 1 33.82% 34.10% 0.00% 0.00% 0x30 0 1 0xffffffffa1770945 779 495 534 6011 224 [k] rwsem_spin_on_owner [kernel.kallsyms] rwsem_spin_on_owner+53 0 1 17.06% 15.28% 0.00% 0.00% 0x30 0 1 0xffffffffa1770915 593 438 468 2715 224 [k] rwsem_spin_on_owner [kernel.kallsyms] rwsem_spin_on_owner+5 0 1 3.54% 3.52% 0.00% 0.00% 0x30 0 1 0xffffffffa2199f84 881 601 583 1421 223 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+84 0 1 With this change: ------------------------------------------------------------- 0 556 838 0 0 0xff2780d7965d2780 ------------------------------------------------------------- 0.18% 0.60% 0.00% 0.00% 0x8 0 1 0xffffffffafff27b8 503 453 569 14 13 [k] do_dentry_open [kernel.kallsyms] do_dentry_open+456 0 1 0.54% 0.12% 0.00% 0.00% 0x8 0 1 0xffffffffaffc51ac 510 199 428 15 12 [k] hugepage_vma_check [kernel.kallsyms] hugepage_vma_check+252 0 1 1.80% 2.15% 0.00% 0.00% 0x18 0 1 0xffffffffb079a1d6 1778 799 343 215 136 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+678 0 1 0.54% 1.31% 0.00% 0.00% 0x18 0 1 0xffffffffb079a1bf 547 296 528 91 71 [k] 
rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+655 0 1 0.72% 0.72% 0.00% 0.00% 0x18 0 1 0xffffffffb079a58c 1479 1534 676 288 163 [k] down_write [kernel.kallsyms] down_write+28 0 1 0.00% 0.12% 0.00% 0.00% 0x18 0 1 0xffffffffafd707db 0 2381 744 282 158 [k] up_write [kernel.kallsyms] up_write+27 0 1 0.00% 0.12% 0.00% 0.00% 0x18 0 1 0xffffffffb079a064 0 239 518 6 6 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+308 0 1 46.58% 47.02% 0.00% 0.00% 0x20 0 1 0xffffffffafd70945 704 403 499 1137 219 [k] rwsem_spin_on_owner [kernel.kallsyms] rwsem_spin_on_owner+53 0 1 23.92% 25.78% 0.00% 0.00% 0x20 0 1 0xffffffffafd70915 558 413 500 542 185 [k] rwsem_spin_on_owner [kernel.kallsyms] rwsem_spin_on_owner+5 0 1 v1->v2: change padding to exchange fields. Link: https://lkml.kernel.org/r/20230716145653.20122-1-lipeng.zhu@intel.com Signed-off-by: Lipeng Zhu Reviewed-by: Tim Chen Cc: Alexander Viro Cc: Christian Brauner Cc: Yu Ma Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..bb00d37a6ca0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -447,11 +447,11 @@ struct address_space { atomic_t nr_thps; #endif struct rb_root_cached i_mmap; - struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; + struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; spinlock_t private_lock; struct list_head private_list; -- cgit v1.2.3 From cabdf74e6b319c989eb8e812f1854291ae0af1c0 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 18 Jul 2023 15:30:19 +0800 Subject: mm: kfence: allocate kfence_metadata at runtime kfence_metadata is currently a static array. For the purpose of allocating scalable __kfence_pool, we first change it to runtime allocation of metadata. Since the size of an object of kfence_metadata is 1160 bytes, we can save at least 72 pages (with default 256 objects) without enabling kfence. [akpm@linux-foundation.org: restore newline, per Marco] Link: https://lkml.kernel.org/r/20230718073019.52513-1-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/kfence.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 726857a4b680..401af4757514 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -59,15 +59,16 @@ static __always_inline bool is_kfence_address(const void *addr) } /** - * kfence_alloc_pool() - allocate the KFENCE pool via memblock + * kfence_alloc_pool_and_metadata() - allocate the KFENCE pool and KFENCE + * metadata via memblock */ -void __init kfence_alloc_pool(void); +void __init kfence_alloc_pool_and_metadata(void); /** * kfence_init() - perform KFENCE initialization at boot time * - * Requires that kfence_alloc_pool() was called before. This sets up the - * allocation gate timer, and requires that workqueues are available. + * Requires that kfence_alloc_pool_and_metadata() was called before. This sets + * up the allocation gate timer, and requires that workqueues are available. 
*/ void __init kfence_init(void); @@ -223,7 +224,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla #else /* CONFIG_KFENCE */ static inline bool is_kfence_address(const void *addr) { return false; } -static inline void kfence_alloc_pool(void) { } +static inline void kfence_alloc_pool_and_metadata(void) { } static inline void kfence_init(void) { } static inline void kfence_shutdown_cache(struct kmem_cache *s) { } static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; } -- cgit v1.2.3 From affd26b1fbd67fceea70d9ceac40ff4815afbeb5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 19 Jul 2023 11:41:45 -0700 Subject: mm/hugetlb: get rid of page_hstate() Convert the last page_hstate() user to use folio_hstate() so page_hstate() can be safely removed. Link: https://lkml.kernel.org/r/20230719184145.301911-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9f4bac3df59e..0a393bc02f25 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -841,11 +841,6 @@ static inline struct hstate *folio_hstate(struct folio *folio) return size_to_hstate(folio_size(folio)); } -static inline struct hstate *page_hstate(struct page *page) -{ - return folio_hstate(page_folio(page)); -} - static inline unsigned hstate_index_to_shift(unsigned index) { return hstates[index].order + PAGE_SHIFT; @@ -1062,11 +1057,6 @@ static inline struct hstate *folio_hstate(struct folio *folio) return NULL; } -static inline struct hstate *page_hstate(struct page *page) -{ - return NULL; -} - static inline struct hstate *size_to_hstate(unsigned long size) { return NULL; -- cgit v1.2.3 From 134d153c9346fc1b2842cacec8720da3a9667a11 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Jul 2023 15:55:49 -0400 Subject: maple_tree: relax lockdep checks for on-stack trees To support early release of the maple tree locks, do not lockdep check the lock if it is set to NULL. This is intended for the special case on-stack use of tracking entries and not for general use. Link: https://lkml.kernel.org/r/20230714195551.894800-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 7769270b85e8..6618c1512886 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -182,7 +182,9 @@ enum maple_type { #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; -#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock) +#define mt_lock_is_held(mt) \ + (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) + #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #else -- cgit v1.2.3 From 02fdb25fb41c563a3afbd4a97469c527a6c5abbf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 5 Jul 2023 14:47:49 -0400 Subject: mm/mmap: change detached vma locking scheme Don't set the lock to the mm lock so that the detached VMA tree does not complain about being unlocked when the mmap_lock is dropped prior to freeing the tree. 
Introduce mt_on_stack() for setting the external lock to NULL only when LOCKDEP is used. Move the destroying of the detached tree outside the mmap lock all together. Link: https://lkml.kernel.org/r/20230719183142.ktgcmuj2pnlr3h3s@revolver Signed-off-by: Liam R. Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 6618c1512886..e278b9598428 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -187,10 +187,13 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map + +#define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) +#define mt_on_stack(mt) do { } while (0) #endif /* -- cgit v1.2.3 From 19a462f06eb5a78e0c3ebe4fd4fbdc71620b8788 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Jul 2023 15:55:51 -0400 Subject: maple_tree: Be more strict about locking Use lockdep to check the write path in the maple tree holds the lock in write mode. Introduce mt_write_lock_is_held() to check if the lock is held for writing. Update the necessary checks for rcu_dereference_protected() to use the new write lock check. Link: https://lkml.kernel.org/r/20230714195551.894800-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e278b9598428..949f911bf955 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -185,13 +185,18 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) +#define mt_write_lock_is_held(mt) \ + (!(mt)->ma_external_lock || \ + lock_is_held_type((mt)->ma_external_lock, 0)) + #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; -#define mt_lock_is_held(mt) 1 +#define mt_lock_is_held(mt) 1 +#define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif -- cgit v1.2.3 From ec8832d007cb7b50229ad5745eec35b847cc9120 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:06 +1000 Subject: mmu_notifiers: don't invalidate secondary TLBs as part of mmu_notifier_invalidate_range_end() Secondary TLBs are now invalidated from the architecture specific TLB invalidation functions. Therefore there is no need to explicitly notify or invalidate as part of the range end functions. This means we can remove mmu_notifier_invalidate_range_end_only() and some of the ptep_*_notify() functions. 
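(The helper is declared as mmu_notifier_invalidate_range_only_end() in the header below.) A hedged before/after sketch of the kind of caller conversion this implies (illustrative, not copied from the series):

	/* before: core MM notified secondary TLBs explicitly */
	pte = ptep_clear_flush_notify(vma, addr, ptep);
	/* ... */
	mmu_notifier_invalidate_range_only_end(&range);

	/* after: the architecture TLB-flush paths notify secondary TLBs,
	 * so the plain helpers are sufficient */
	pte = ptep_clear_flush(vma, addr, ptep);
	/* ... */
	mmu_notifier_invalidate_range_end(&range);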
Link: https://lkml.kernel.org/r/90d749d03cbab256ca0edeb5287069599566d783.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Catalin Marinas Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 56 ++------------------------------------------ 1 file changed, 2 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 64a3e051c3c4..f2e9edc6aa43 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -395,8 +395,7 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); -extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, - bool only_end); +extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool @@ -481,14 +480,7 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) might_sleep(); if (mm_has_notifiers(range->mm)) - __mmu_notifier_invalidate_range_end(range, false); -} - -static inline void -mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) -{ - if (mm_has_notifiers(range->mm)) - __mmu_notifier_invalidate_range_end(range, true); + __mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -582,45 +574,6 @@ static inline void mmu_notifier_range_init_owner( __young; \ }) -#define ptep_clear_flush_notify(__vma, __address, __ptep) \ -({ \ - unsigned long ___addr = __address & PAGE_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pte_t ___pte; \ - \ - ___pte = ptep_clear_flush(__vma, __address, __ptep); \ - mmu_notifier_invalidate_range(___mm, ___addr, \ - ___addr + PAGE_SIZE); \ - \ - ___pte; \ -}) - -#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ - mmu_notifier_invalidate_range(___mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - -#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pud_t ___pud; \ - \ - ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \ - mmu_notifier_invalidate_range(___mm, ___haddr, \ - ___haddr + HPAGE_PUD_SIZE); \ - \ - ___pud; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. 
* This is safe to start by updating the secondary MMUs, because the primary MMU @@ -711,11 +664,6 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void -mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) -{ -} - static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { -- cgit v1.2.3 From 1af5a8109904b7f00828e7f9f63f5695b42f8215 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:07 +1000 Subject: mmu_notifiers: rename invalidate_range notifier There are two main use cases for mmu notifiers. One is by KVM which uses mmu_notifier_invalidate_range_start()/end() to manage a software TLB. The other is to manage hardware TLBs which need to use the invalidate_range() callback because HW can establish new TLB entries at any time. Hence using start/end() can lead to memory corruption as these callbacks happen too soon/late during page unmap. mmu notifier users should therefore either use the start()/end() callbacks or the invalidate_range() callbacks. To make this usage clearer rename the invalidate_range() callback to arch_invalidate_secondary_tlbs() and update documention. Link: https://lkml.kernel.org/r/6f77248cd25545c8020a54b4e567e8b72be4dca1.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 48 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index f2e9edc6aa43..6e3c857606f1 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -187,27 +187,27 @@ struct mmu_notifier_ops { const struct mmu_notifier_range *range); /* - * invalidate_range() is either called between - * invalidate_range_start() and invalidate_range_end() when the - * VM has to free pages that where unmapped, but before the - * pages are actually freed, or outside of _start()/_end() when - * a (remote) TLB is necessary. + * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB + * which shares page-tables with the CPU. The + * invalidate_range_start()/end() callbacks should not be implemented as + * invalidate_secondary_tlbs() already catches the points in time when + * an external TLB needs to be flushed. * - * If invalidate_range() is used to manage a non-CPU TLB with - * shared page-tables, it not necessary to implement the - * invalidate_range_start()/end() notifiers, as - * invalidate_range() already catches the points in time when an - * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/mm/mmu_notifier.rst + * This requires arch_invalidate_secondary_tlbs() to be called while + * holding the ptl spin-lock and therefore this callback is not allowed + * to sleep. * - * Note that this function might be called with just a sub-range - * of what was passed to invalidate_range_start()/end(), if - * called between those functions. + * This is called by architecture code whenever invalidating a TLB + * entry. 
It is assumed that any secondary TLB has the same rules for + * when invalidations are required. If this is not the case architecture + * code will need to call this explicitly when required for secondary + * TLB invalidation. */ - void (*invalidate_range)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + void (*arch_invalidate_secondary_tlbs)( + struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * These callbacks are used with the get/put interface to manage the @@ -396,8 +396,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); -extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); @@ -483,11 +483,11 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) __mmu_notifier_invalidate_range_end(range); } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range(mm, start, end); + __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) @@ -664,7 +664,7 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } -- cgit v1.2.3 From ea09800bf17561a0d20498923d766468c0d7a4a7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 20 Jul 2023 19:28:06 +0800 Subject: mm: fix obsolete function name above debug_pagealloc_enabled_static() Since commit 04013513cc84 ("mm, page_alloc: do not rely on the order of page_poison and init_on_alloc/free parameters"), init_debug_pagealloc() is converted to init_mem_debugging_and_hardening(). Later it's renamed to mem_debugging_and_hardening_init() via commit f2fc4b44ec2b ("mm: move init_mem_debugging_and_hardening() to mm/mm_init.c"). Link: https://lkml.kernel.org/r/20230720112806.3851893-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ae5654f665b..84988d4ff8fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3465,8 +3465,8 @@ static inline bool debug_pagealloc_enabled(void) } /* - * For use in fast paths after init_debug_pagealloc() has run, or when a - * false negative result is not harmful when called too early. + * For use in fast paths after mem_debugging_and_hardening_init() has run, + * or when a false negative result is not harmful when called too early. 
*/ static inline bool debug_pagealloc_enabled_static(void) { -- cgit v1.2.3 From 6d2790d95d7cffaf0def36270032ce5228dd43a5 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:44 +0800 Subject: mm/page_io: introduce bio_first_folio_all() Introduce bio_first_folio_all() to return a folio, which makes it easier to use. Link: https://lkml.kernel.org/r/20230721034451.16412-4-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Suggested-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/bio.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index c4f5b5228105..027ff9ab5d12 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -253,6 +253,11 @@ static inline struct page *bio_first_page_all(struct bio *bio) return bio_first_bvec_all(bio)->bv_page; } +static inline struct folio *bio_first_folio_all(struct bio *bio) +{ + return page_folio(bio_first_page_all(bio)); +} + static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); -- cgit v1.2.3 From 90717566f8f6b6761494ccfff43ea62af443fc9b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 20 Jul 2023 21:34:36 +0200 Subject: mm: don't drop VMA locks in mm_drop_all_locks() Despite its name, mm_drop_all_locks() does not drop _all_ locks; the mmap lock is held write-locked by the caller, and the caller is responsible for dropping the mmap lock at a later point (which will also release the VMA locks). Calling vma_end_write_all() here is dangerous because the caller might have write-locked a VMA with the expectation that it will stay write-locked until the mmap_lock is released, as usual. This _almost_ becomes a problem in the following scenario: An anonymous VMA A and an SGX VMA B are mapped adjacent to each other. Userspace calls munmap() on a range starting at the start address of A and ending in the middle of B. Hypothetical call graph with additional notes in brackets: do_vmi_align_munmap [begin first for_each_vma_range loop] vma_start_write [on VMA A] vma_mark_detached [on VMA A] __split_vma [on VMA B] sgx_vma_open [== new->vm_ops->open] sgx_encl_mm_add __mmu_notifier_register [luckily THIS CAN'T ACTUALLY HAPPEN] mm_take_all_locks mm_drop_all_locks vma_end_write_all [drops VMA lock taken on VMA A before] vma_start_write [on VMA B] vma_mark_detached [on VMA B] [end first for_each_vma_range loop] vma_iter_clear_gfp [removes VMAs from maple tree] mmap_write_downgrade unmap_region mmap_read_unlock In this hypothetical scenario, while do_vmi_align_munmap() thinks it still holds a VMA write lock on VMA A, the VMA write lock has actually been invalidated inside __split_vma(). The call from sgx_encl_mm_add() to __mmu_notifier_register() can't actually happen here, as far as I understand, because we are duplicating an existing SGX VMA, but sgx_encl_mm_add() only calls __mmu_notifier_register() for the first SGX VMA created in a given process. So this could only happen in fork(), not on munmap(). But in my view it is just pure luck that this can't happen. Also, we wouldn't actually have any bad consequences from this in do_vmi_align_munmap(), because by the time the bug drops the lock on VMA A, we've already marked VMA A as detached, which makes it completely ineligible for any VMA-locked page faults. But again, that's just pure luck. 
So remove the vma_end_write_all(), so that VMA write locks are only ever released on mmap_write_unlock() or mmap_write_downgrade(). Also add comments to document the locking rules established by this patch. Link: https://lkml.kernel.org/r/20230720193436.454247-1-jannh@google.com Fixes: eeff9a5d47f8 ("mm/mmap: prevent pagefault handler from racing with mmu_notifier registration") Signed-off-by: Jann Horn Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/mmap_lock.h | 8 ++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 84988d4ff8fb..93eb291181f7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,11 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) return (vma->vm_lock_seq == *mm_lock_seq); } +/* + * Begin writing to a VMA. + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + */ static inline void vma_start_write(struct vm_area_struct *vma) { int mm_lock_seq; diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index a5c63b6d7d46..8d38dcb6d044 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -73,6 +73,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. + */ static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); -- cgit v1.2.3 From fd892593d44d8b649caf30a67f0c7696d976d901 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:45 -0400 Subject: mm: change do_vmi_align_munmap() tracking of VMAs to remove The majority of the calls to munmap a vm range is within a single vma. The maple tree is able to store a single entry at 0, with a size of 1 as a pointer and avoid any allocations. Change do_vmi_align_munmap() to store the VMAs being munmap()'ed into a tree indexed by the count. This will leverage the ability to store the first entry without a node allocation. Storing the entries into a tree by the count and not the vma start and end means changing the functions which iterate over the entries. Update unmap_vmas() and free_pgtables() to take a maple state and a tree end address to support this functionality. Passing through the same maple state to unmap_vmas() and free_pgtables() means the state needs to be reset between calls. This happens in the static unmap_region() and exit_mmap(). Link: https://lkml.kernel.org/r/20230724183157.3939892-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 93eb291181f7..ded514ee2588 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2287,9 +2287,9 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } -void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, +void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; -- cgit v1.2.3 From c1297987cc2ada57a7faea7985c2334548d110f9 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:47 -0400 Subject: maple_tree: introduce __mas_set_range() mas_set_range() resets the node to MAS_START, which will cause a re-walk of the tree to the range. This is unnecessary when the maple state is already at the correct location of the write. Add a function that only sets the range to avoid unnecessary re-walking of the tree. Link: https://lkml.kernel.org/r/20230724183157.3939892-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 949f911bf955..e10db656e31c 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -539,6 +539,22 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the + * current location. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * set the internal maple state values to a sub-range. + * Please use mas_set_range() if you do not know where you are in the tree. + */ +static inline void __mas_set_range(struct ma_state *mas, unsigned long start, + unsigned long last) +{ + mas->index = start; + mas->last = last; +} /** * mas_set_range() - Set up Maple Tree operation state for a different index. @@ -553,9 +569,8 @@ static inline void mas_reset(struct ma_state *mas) static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - mas->index = start; - mas->last = last; - mas->node = MAS_START; + __mas_set_range(mas, start, last); + mas->node = MAS_START; } /** -- cgit v1.2.3 From da0892547b101df6e13255b378380d077975368d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:49 -0400 Subject: maple_tree: re-introduce entry to mas_preallocate() arguments The current preallocation strategy is to preallocate the absolute worst-case allocation for a tree modification. The entry (or NULL) is needed to know how many nodes are needed to write to the tree. Start by adding the argument to the mas_preallocate() definition. Link: https://lkml.kernel.org/r/20230724183157.3939892-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e10db656e31c..c962af188681 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -466,7 +466,7 @@ void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); -int mas_preallocate(struct ma_state *mas, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); -- cgit v1.2.3 From 284e05920498788c5df1a7dd6424adb426498e1c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:01 +0100 Subject: mm: remove CONFIG_PER_VMA_LOCK ifdefs Patch series "Handle most file-backed faults under the VMA lock", v3. This patchset adds the ability to handle page faults on parts of files which are already in the page cache without taking the mmap lock. This patch (of 10): Provide lock_vma_under_rcu() when CONFIG_PER_VMA_LOCK is not defined to eliminate ifdefs in the users. Link: https://lkml.kernel.org/r/20230724185410.1124082-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Punit Agrawal Cc: Arjun Roy Cc: Eric Dumazet Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ded514ee2588..21299a0cfbca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -742,6 +742,12 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} + #endif /* CONFIG_PER_VMA_LOCK */ /* -- cgit v1.2.3 From 350f6bbca1de515cd7519a33661cefc93ea06054 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:02 +0100 Subject: mm: allow per-VMA locks on file-backed VMAs Remove the TCP layering violation by allowing per-VMA locks on all VMAs. The fault path will immediately fail in handle_mm_fault(). There may be a small performance reduction from this patch as a little unnecessary work will be done on each page fault. See later patches for the improvement. 
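As a rough illustration of the call pattern this series is working towards, a fault handler can now attempt the per-VMA lock unconditionally and fall back to the mmap_lock path when that fails. This is only a sketch (retry handling is simplified and FAULT_FLAG_VMA_LOCK is assumed from the per-VMA-lock series), not the actual arch fault-handler hunks:

/* Sketch, assumes <linux/mm.h>: with the !CONFIG_PER_VMA_LOCK stub above
 * returning NULL, the caller needs no #ifdef around the lock attempt. */
static vm_fault_t fault_under_vma_lock_sketch(struct mm_struct *mm,
					      unsigned long address,
					      unsigned int flags,
					      struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t ret;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		return VM_FAULT_RETRY;	/* caller takes the mmap_lock path */

	ret = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(ret & VM_FAULT_RETRY))
		vma_end_read(vma);
	return ret;
}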
Link: https://lkml.kernel.org/r/20230724185410.1124082-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- include/linux/net_mm.h | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 include/linux/net_mm.h (limited to 'include/linux') diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h deleted file mode 100644 index b298998bd5a0..000000000000 --- a/include/linux/net_mm.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifdef CONFIG_MMU - -#ifdef CONFIG_INET -extern const struct vm_operations_struct tcp_vm_ops; -static inline bool vma_is_tcp(const struct vm_area_struct *vma) -{ - return vma->vm_ops == &tcp_vm_ops; -} -#else -static inline bool vma_is_tcp(const struct vm_area_struct *vma) -{ - return false; -} -#endif /* CONFIG_INET*/ - -#endif /* CONFIG_MMU */ -- cgit v1.2.3 From 348ad1606f4c09e3dc28092baac474e10a252471 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:47 +0530 Subject: mm/hugepage pud: allow arch-specific helper function to check huge page pud support Patch series "Add support for DAX vmemmap optimization for ppc64", v6. This patch series implements changes required to support DAX vmemmap optimization for ppc64. The vmemmap optimization is only enabled with radix MMU translation and 1GB PUD mapping with 64K page size. The patch series also splits the hugetlb vmemmap optimization as a separate Kconfig variable so that architectures can enable DAX vmemmap optimization without enabling hugetlb vmemmap optimization. This should enable architectures like arm64 to enable DAX vmemmap optimization while they can't enable hugetlb vmemmap optimization. More details of the same are in patch "mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization". With 64K page size for 16384 pages added (1G) we save 14 pages With 4K page size for 262144 pages added (1G) we save 4094 pages With 4K page size for 512 pages added (2M) we save 6 pages This patch (of 13): Architectures like powerpc would like to enable transparent huge page pud support only with radix translation. To support that add has_transparent_pud_hugepage() helper that architectures can override. [aneesh.kumar@linux.ibm.com: use the new has_transparent_pud_hugepage()] Link: https://lkml.kernel.org/r/87tttrvtaj.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5f36c055794b..5eb6bdf30c62 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1505,6 +1505,9 @@ typedef unsigned int pgtbl_mod_mask; #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif +#ifndef has_transparent_pud_hugepage +#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +#endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. 
-- cgit v1.2.3 From f32928ab6fe5abac5a270b6c0bffc4ce77ee8c42 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:48 +0530 Subject: mm: change pudp_huge_get_and_clear_full take vm_area_struct as arg We will use this in a later patch to do tlb flush when clearing pud entries on powerpc. This is similar to commit 93a98695f2f9 ("mm: change pmdp_huge_get_and_clear_full take vm_area_struct as arg") Link: https://lkml.kernel.org/r/20230724190759.483013-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5eb6bdf30c62..124427ece520 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -456,11 +456,11 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL -static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, +static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { - return pudp_huge_get_and_clear(mm, address, pudp); + return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v1.2.3 From c1a6c536fb088c01d6bdce77731d89ad5e1734c6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:49 +0530 Subject: mm/vmemmap: improve vmemmap_can_optimize and allow architectures to override dax vmemmap optimization requires a minimum of 2 PAGE_SIZE area within vmemmap such that tail page mapping can point to the second PAGE_SIZE area. Enforce that in vmemmap_can_optimize() function. Architectures like powerpc also want to enable vmemmap optimization conditionally (only with radix MMU translation). Hence allow architecture override. 
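As a sketch of the override pattern this enables (the radix_enabled() condition is only an assumed example of an architecture-specific check; it is not part of this patch):

/* In an architecture header: refine the generic rule, then point the
 * generic code at the architecture version. */
static inline bool arch_vmemmap_can_optimize(struct vmem_altmap *altmap,
					     struct dev_pagemap *pgmap)
{
	if (!radix_enabled())	/* assumed arch-specific condition */
		return false;
	return __vmemmap_can_optimize(altmap, pgmap);
}
#define vmemmap_can_optimize arch_vmemmap_can_optimize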
Link: https://lkml.kernel.org/r/20230724190759.483013-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 21299a0cfbca..d4ce73c20dcc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3632,13 +3632,32 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif +#define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP -static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) +static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { - return is_power_of_2(sizeof(struct page)) && - pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap; + unsigned long nr_pages; + unsigned long nr_vmemmap_pages; + + if (!pgmap || !is_power_of_2(sizeof(struct page))) + return false; + + nr_pages = pgmap_vmemmap_nr(pgmap); + nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); + /* + * For vmemmap optimization with DAX we need minimum 2 vmemmap + * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst + */ + return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } +/* + * If we don't have an architecture override, use the generic rule + */ +#ifndef vmemmap_can_optimize +#define vmemmap_can_optimize __vmemmap_can_optimize +#endif + #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) -- cgit v1.2.3 From 973bf6800cf37802ca48292378d574f2a689de9b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:51 +0530 Subject: mm: add pud_same similar to __HAVE_ARCH_P4D_SAME This helps architectures to override pmd_same and pud_same independently. Link: https://lkml.kernel.org/r/20230724190759.483013-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 124427ece520..0af8bc4ce258 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -699,11 +699,14 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } +#endif +#ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } +#define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME -- cgit v1.2.3 From 54a948a1e97a9d19a0a4bed63a2d4caef30c5d17 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:52 +0530 Subject: mm/huge pud: use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE pudp_set_wrprotect and move_huge_pud helpers are only used when CONFIG_TRANSPARENT_HUGEPAGE is enabled. 
Similar to pmdp_set_wrprotect and move_huge_pmd_helpers use architecture override only if CONFIG_TRANSPARENT_HUGEPAGE is set Link: https://lkml.kernel.org/r/20230724190759.483013-7-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 0af8bc4ce258..f34e0f2cb4d8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -564,6 +564,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { @@ -577,6 +578,7 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm, { BUILD_BUG(); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif -- cgit v1.2.3 From 0b6f15824cc7e431a9706c78bfb9cb3011477ad3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:53 +0530 Subject: mm/vmemmap optimization: split hugetlb and devdax vmemmap optimization Arm disabled hugetlb vmemmap optimization [1] because hugetlb vmemmap optimization includes an update of both the permissions (writeable to read-only) and the output address (pfn) of the vmemmap ptes. That is not supported without unmapping of pte(marking it invalid) by some architectures. With DAX vmemmap optimization we don't require such pte updates and architectures can enable DAX vmemmap optimization while having hugetlb vmemmap optimization disabled. Hence split DAX optimization support into a different config. s390, loongarch and riscv don't have devdax support. So the DAX config is not enabled for them. With this change, arm64 should be able to select DAX optimization [1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable HUGETLB_PAGE_OPTIMIZE_VMEMMAP") Link: https://lkml.kernel.org/r/20230724190759.483013-8-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d4ce73c20dcc..b2520dd555f9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3633,7 +3633,7 @@ void vmemmap_free(unsigned long start, unsigned long end, #endif #define VMEMMAP_RESERVE_NR 2 -#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { -- cgit v1.2.3 From 42c06a0e8ebe95b81e5fb41c6556ff22d9255b0c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 17 Jul 2023 12:02:27 -0400 Subject: mm: kill frontswap The only user of frontswap is zswap, and has been for a long time. Have swap call into zswap directly and remove the indirection. 
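On the write-out side the resulting call shape is roughly the following. This is a simplified sketch; the fallback helper is a placeholder name, not the real mm/page_io.c code:

/* Sketch: a swap_writepage()-style path calling zswap directly. */
static int swap_writeout_sketch(struct page *page, struct writeback_control *wbc)
{
	if (zswap_store(page)) {
		/* Kept compressed by zswap; no block I/O is needed. */
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
		return 0;
	}
	return submit_swap_bio_sketch(page, wbc);	/* placeholder fallback */
}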
[hannes@cmpxchg.org: remove obsolete comment, per Yosry] Link: https://lkml.kernel.org/r/20230719142832.GA932528@cmpxchg.org [fengwei.yin@intel.com: don't warn if none swapcache folio is passed to zswap_load] Link: https://lkml.kernel.org/r/20230810095652.3905184-1-fengwei.yin@intel.com Link: https://lkml.kernel.org/r/20230717160227.GA867137@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Yin Fengwei Acked-by: Konrad Rzeszutek Wilk Acked-by: Nhat Pham Acked-by: Yosry Ahmed Acked-by: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Matthew Wilcox (Oracle) Cc: Vitaly Wool Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/frontswap.h | 91 ----------------------------------------------- include/linux/swap.h | 9 ----- include/linux/swapfile.h | 5 --- include/linux/zswap.h | 37 +++++++++++++++++++ 4 files changed, 37 insertions(+), 105 deletions(-) delete mode 100644 include/linux/frontswap.h create mode 100644 include/linux/zswap.h (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h deleted file mode 100644 index eaa0ac5f9003..000000000000 --- a/include/linux/frontswap.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_FRONTSWAP_H -#define _LINUX_FRONTSWAP_H - -#include -#include -#include -#include - -struct frontswap_ops { - void (*init)(unsigned); /* this swap type was just swapon'ed */ - int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ - int (*load)(unsigned, pgoff_t, struct page *, bool *); /* load a page */ - void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ - void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ -}; - -int frontswap_register_ops(const struct frontswap_ops *ops); - -extern void frontswap_init(unsigned type, unsigned long *map); -extern int __frontswap_store(struct page *page); -extern int __frontswap_load(struct page *page); -extern void __frontswap_invalidate_page(unsigned, pgoff_t); -extern void __frontswap_invalidate_area(unsigned); - -#ifdef CONFIG_FRONTSWAP -extern struct static_key_false frontswap_enabled_key; - -static inline bool frontswap_enabled(void) -{ - return static_branch_unlikely(&frontswap_enabled_key); -} - -static inline void frontswap_map_set(struct swap_info_struct *p, - unsigned long *map) -{ - p->frontswap_map = map; -} - -static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) -{ - return p->frontswap_map; -} -#else -/* all inline routines become no-ops and all externs are ignored */ - -static inline bool frontswap_enabled(void) -{ - return false; -} - -static inline void frontswap_map_set(struct swap_info_struct *p, - unsigned long *map) -{ -} - -static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) -{ - return NULL; -} -#endif - -static inline int frontswap_store(struct page *page) -{ - if (frontswap_enabled()) - return __frontswap_store(page); - - return -1; -} - -static inline int frontswap_load(struct page *page) -{ - if (frontswap_enabled()) - return __frontswap_load(page); - - return -1; -} - -static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) -{ - if (frontswap_enabled()) - __frontswap_invalidate_page(type, offset); -} - -static inline void frontswap_invalidate_area(unsigned type) -{ - if (frontswap_enabled()) - __frontswap_invalidate_area(type); -} - -#endif /* _LINUX_FRONTSWAP_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 456546443f1f..bb5adc604144 100644 --- a/include/linux/swap.h +++ 
b/include/linux/swap.h @@ -302,10 +302,6 @@ struct swap_info_struct { struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ struct completion comp; /* seldom referenced */ -#ifdef CONFIG_FRONTSWAP - unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ - atomic_t frontswap_pages; /* frontswap pages in-use counter */ -#endif spinlock_t lock; /* * protect map scan related fields like * swap_map, lowest_bit, highest_bit, @@ -630,11 +626,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif -#ifdef CONFIG_ZSWAP -extern u64 zswap_pool_total_size; -extern atomic_t zswap_stored_pages; -#endif - #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 7ed529a77c5b..99e3ed469e88 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -2,11 +2,6 @@ #ifndef _LINUX_SWAPFILE_H #define _LINUX_SWAPFILE_H -/* - * these were static in swapfile.c but frontswap.c needs them and we don't - * want to expose them to the dozens of source files that include swap.h - */ -extern struct swap_info_struct *swap_info[]; extern unsigned long generic_max_swapfile_size(void); unsigned long arch_max_swapfile_size(void); diff --git a/include/linux/zswap.h b/include/linux/zswap.h new file mode 100644 index 000000000000..850c377d9b6d --- /dev/null +++ b/include/linux/zswap.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ZSWAP_H +#define _LINUX_ZSWAP_H + +#include +#include + +extern u64 zswap_pool_total_size; +extern atomic_t zswap_stored_pages; + +#ifdef CONFIG_ZSWAP + +bool zswap_store(struct page *page); +bool zswap_load(struct page *page); +void zswap_invalidate(int type, pgoff_t offset); +void zswap_swapon(int type); +void zswap_swapoff(int type); + +#else + +static inline bool zswap_store(struct page *page) +{ + return false; +} + +static inline bool zswap_load(struct page *page) +{ + return false; +} + +static inline void zswap_invalidate(int type, pgoff_t offset) {} +static inline void zswap_swapon(int type) {} +static inline void zswap_swapoff(int type) {} + +#endif + +#endif /* _LINUX_ZSWAP_H */ -- cgit v1.2.3 From 34f4c198bfbe86612c368eb122002787acecaa93 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:40 +0100 Subject: zswap: make zswap_store() take a folio Patch series "Followup folio conversions for zswap". With frontswap killed, it's worth converting the zswap_load() and zswap_store() functions to take a folio instead of a page pointer. They aren't converted to support large folios, but there are a lot of unnecessary calls to compound_head() that are removed by these patches. This patch (of 4): Only convert a few easy parts of this function to use the folio passed in; convert back to struct page for the majority of it. This does remove a few hidden calls to compound_head(). 
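The partial-conversion pattern inside such a function is roughly as below; this is illustrative only and the remainder helper is a placeholder:

static bool zswap_store_sketch(struct folio *folio)
{
	struct page *page = &folio->page;	/* unconverted majority still uses the page */

	if (folio_test_large(folio))		/* large folios are not supported yet */
		return false;

	return compress_and_store_sketch(page);	/* placeholder for the unconverted rest */
}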
Link: https://lkml.kernel.org/r/20230715042343.434588-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230715042343.434588-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 850c377d9b6d..9f318c8bc367 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -10,7 +10,7 @@ extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP -bool zswap_store(struct page *page); +bool zswap_store(struct folio *folio); bool zswap_load(struct page *page); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); @@ -18,7 +18,7 @@ void zswap_swapoff(int type); #else -static inline bool zswap_store(struct page *page) +static inline bool zswap_store(struct folio *folio) { return false; } -- cgit v1.2.3 From 074e3e262adb02a7a42e32e0aadfb6b6cf416854 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:41 +0100 Subject: memcg: convert get_obj_cgroup_from_page to get_obj_cgroup_from_folio As the one caller now has a folio, pass it in and use it. Removes three calls to compound_head(). Link: https://lkml.kernel.org/r/20230715042343.434588-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 58eb7ca65699..058fb748e128 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1759,7 +1759,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); struct obj_cgroup *get_obj_cgroup_from_current(void); -struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); +struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); @@ -1843,7 +1843,7 @@ static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } -static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) +static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; } -- cgit v1.2.3 From ca54f6d89d60abf3e7dea68c95dfd442eeece212 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:43 +0100 Subject: zswap: make zswap_load() take a folio Only convert a few easy parts of this function to use the folio passed in; convert back to struct page for the majority of it. Removes three hidden calls to compound_head(). 
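Illustratively, the caller side becomes the following (a sketch of typical obj_cgroup usage built from the declarations above, not the actual zswap hunk):

static int charge_folio_bytes_sketch(struct folio *folio, size_t size)
{
	struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
	int ret;

	if (!objcg)
		return 0;	/* nothing to charge against */

	ret = obj_cgroup_charge(objcg, GFP_KERNEL, size);
	obj_cgroup_put(objcg);
	return ret;
}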
Link: https://lkml.kernel.org/r/20230715042343.434588-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 9f318c8bc367..2a60ce39cfde 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -11,7 +11,7 @@ extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP bool zswap_store(struct folio *folio); -bool zswap_load(struct page *page); +bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); @@ -23,7 +23,7 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct page *page) +static inline bool zswap_load(struct folio *folio) { return false; } -- cgit v1.2.3 From c0a5d93a885b42c22085ad0f39233795f4a578e0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Jul 2023 22:58:10 +0800 Subject: mm/page_ext: add common function to get client data from page_ext Patch series "add page_ext_data to get client data in page_ext". Current clients get data from page_ext by adding offset which is auto generated in page_ext core and exposes the data layout design inside page_ext core. This series adds a page_ext_data() to hide this from clients. Benefits include: 1. Future clients can call page_ext_data directly instead of defining a new function like get_page_owner to get the data. 2. There is no change to clients if the layout of page_ext data changes. This patch (of 3): Add common page_ext_data function to get client data. This could hide offset which is auto generated in page_ext core and expose the desgin of page_ext data layout. Link: https://lkml.kernel.org/r/20230718145812.1991717-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230718145812.1991717-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Andrew Morton Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 67314f648aeb..658d8d39a10e 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -82,6 +82,12 @@ static inline void page_ext_init(void) extern struct page_ext *page_ext_get(struct page *page); extern void page_ext_put(struct page_ext *page_ext); +static inline void *page_ext_data(struct page_ext *page_ext, + struct page_ext_operations *ops) +{ + return (void *)(page_ext) + ops->offset; +} + static inline struct page_ext *page_ext_next(struct page_ext *curr) { void *next = curr; -- cgit v1.2.3 From 5d241789dfe1c0e5c9b4eb4ae6e48590964b4976 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Jul 2023 19:59:34 +0800 Subject: mm/memcg: fix obsolete function name in mem_cgroup_protection() Commit 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") changed the function name but not the corresponding comment. 
Link: https://lkml.kernel.org/r/20230727115934.657787-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 058fb748e128..419e001a02e4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -582,7 +582,7 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root, /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because - * mem_cgroup_protected calculation is not robust enough to keep + * mem_cgroup_calculate_protection is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) -- cgit v1.2.3 From 67311a36e5e1e56dfa7264c93db759b18217e114 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 17 Jul 2023 19:32:27 +0800 Subject: mm/page_ext: move page_ext_operations definition under CONFIG_PAGE_EXTENSION page_ext_operations should only be defined when CONFIG_PAGE_EXTENSION is enabled. Besides, this may detect missing reliance on CONFIG_PAGE_EXTENSION from future Page Extension clients at compile time. Link: https://lkml.kernel.org/r/20230717113227.1897173-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 658d8d39a10e..be98564191e6 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -8,6 +8,7 @@ struct pglist_data; +#ifdef CONFIG_PAGE_EXTENSION /** * struct page_ext_operations - per page_ext client operations * @offset: Offset to the client's data within page_ext. Offset is returned to @@ -29,8 +30,6 @@ struct page_ext_operations { bool need_shared_flags; }; -#ifdef CONFIG_PAGE_EXTENSION - /* * The page_ext_flags users must set need_shared_flags to true. */ -- cgit v1.2.3 From 11250fd12eb8a58205e69ea36f19fa8c084afb62 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 28 Jul 2023 13:00:40 +0800 Subject: mm: factor out VMA stack and heap checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: convert to vma_is_initial_heap/stack()", v3. Add vma_is_initial_stack() and vma_is_initial_heap() helpers and use them to simplify code. This patch (of 4): Factor out VMA stack and heap checks and name them vma_is_initial_stack() and vma_is_initial_heap() for general use. 
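The simplification for callers looks like this (a sketch modelled on the /proc/PID/maps name printing, not the actual fs/proc change):

static const char *vma_special_name_sketch(struct vm_area_struct *vma)
{
	if (vma_is_initial_heap(vma))
		return "[heap]";
	if (vma_is_initial_stack(vma))
		return "[stack]";
	return NULL;
}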
Link: https://lkml.kernel.org/r/20230728050043.59880-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230728050043.59880-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Acked-by: Peter Zijlstra (Intel) Cc: Christian Göttsche Cc: Alex Deucher Cc: Arnaldo Carvalho de Melo Cc: Christian Göttsche Cc: Christian König Cc: Daniel Vetter Cc: David Airlie Cc: Eric Paris Cc: Felix Kuehling Cc: "Pan, Xinhui" Cc: Paul Moore Cc: Stephen Smalley Signed-off-by: Andrew Morton --- include/linux/mm.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b2520dd555f9..f64d1de3af09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -833,6 +833,31 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) return !vma->vm_ops; } +/* + * Indicate if the VMA is a heap for the given task; for + * /proc/PID/maps that is the heap of the main task. + */ +static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) +{ + return vma->vm_start <= vma->vm_mm->brk && + vma->vm_end >= vma->vm_mm->start_brk; +} + +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) +{ + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; +} + static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); -- cgit v1.2.3 From ca39c5e7d10f09983293f064caa447690cb3ec92 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 1 Aug 2023 20:43:59 +0800 Subject: mm/memcg: update obsolete comment above parent_mem_cgroup() Since commit bef8620cd8e0 ("mm: memcg: deprecate the non-hierarchical mode"), use_hierarchy is already deprecated. And it's further removed via commit 9d9d341df4d5 ("cgroup: remove obsoleted broken_hierarchy and warned_broken_hierarchy"). Update corresponding comment. Link: https://lkml.kernel.org/r/20230801124359.2266860-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 419e001a02e4..163004ae3349 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -860,8 +860,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * - * Returns the parent memcg, or NULL if this is the root or the memory - * controller is in legacy no-hierarchy mode. + * Returns the parent memcg, or NULL if this is the root. 
*/ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { -- cgit v1.2.3 From ab9bda001b681c293fb72ef21f083adfbcd78028 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:00 +0000 Subject: mm/damon/core: introduce address range type damos filter Patch series "Extend DAMOS filters for address ranges and DAMON monitoring targets" There are use cases that need to apply DAMOS schemes to specific address ranges or DAMON monitoring targets. NUMA nodes in the physical address space, special memory objects in the virtual address space, and monitoring target specific efficient monitoring results snapshot retrieval could be examples of such use cases. This patchset extends DAMOS filters feature for such cases, by implementing two more filter types, namely address ranges and DAMON monitoring types. Patches sequence ---------------- The first seven patches are for the address ranges based DAMOS filter. The first patch implements the filter feature and expose it via DAMON kernel API. The second patch further expose the feature to users via DAMON sysfs interface. The third and fourth patches implement unit tests and selftests for the feature. Three patches (fifth to seventh) updating the documents follow. The following six patches are for the DAMON monitoring target based DAMOS filter. The eighth patch implements the feature in the core layer and expose it via DAMON's kernel API. The ninth patch further expose it to users via DAMON sysfs interface. Tenth patch add a selftest, and two patches (eleventh and twelfth) update documents. [1] https://lore.kernel.org/damon/20230728203444.70703-1-sj@kernel.org/ This patch (of 13): Users can know special characteristic of specific address ranges. NUMA nodes or special objects or buffers in virtual address space could be such examples. For such cases, DAMOS schemes could required to be applied to only specific address ranges. Implement yet another type of DAMOS filter for the purpose. Note that the existing filter types, namely anon pages and memcg DAMOS filters needed page level type check. Because such check can be done efficiently in the opertions set layer, those filters are handled in operations set layer. Specifically, only paddr operations set implementation supports these filters. Also, because statistics counting is done in the DAMON core layer, the regions that filtered out by these filters are counted as tried but failed to the statistics. Unlike those, address range based filters can efficiently handled in the core layer. Hence, do the handling in the layer, and count the regions that filtered out by those as the scheme has not tried for the region. This difference should clearly documented. Link: https://lkml.kernel.org/r/20230802214312.110532-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230802214312.110532-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index d5d4d19928e0..476f37a883a4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -226,16 +226,24 @@ struct damos_stat { * enum damos_filter_type - Type of memory for &struct damos_filter * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @DAMOS_FILTER_TYPE_ADDR: Address range. 
* @NR_DAMOS_FILTER_TYPES: Number of filter types. * - * The support of each filter type is up to running &struct damon_operations. - * &enum DAMON_OPS_PADDR is supporting all filter types, while - * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any - * filter types. + * The anon pages type and memcg type filters are handled by underlying + * &struct damon_operations as a part of scheme action trying, and therefore + * accounted as 'tried'. In contrast, other types are handled by core layer + * before trying of the action and therefore not accounted as 'tried'. + * + * The support of the filters that handled by &struct damon_operations depend + * on the running &struct damon_operations. + * &enum DAMON_OPS_PADDR supports both anon pages type and memcg type filters, + * while &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR don't support any of + * the two types. */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, + DAMOS_FILTER_TYPE_ADDR, NR_DAMOS_FILTER_TYPES, }; @@ -244,18 +252,20 @@ enum damos_filter_type { * @type: Type of the page. * @matching: If the matching page should filtered out or in. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. + * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each * page of the region matches to this and avoid applying the action if so. - * Note that the check support is up to &struct damon_operations - * implementation. + * Support of each filter type depends on the running &struct damon_operations + * and the type. Refer to &enum damos_filter_type for more detai. */ struct damos_filter { enum damos_filter_type type; bool matching; union { unsigned short memcg_id; + struct damon_addr_range addr_range; }; struct list_head list; }; -- cgit v1.2.3 From 17e7c724d3c2e622c4d9969b7a473e8ed1d14ff0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:07 +0000 Subject: mm/damon/core: implement target type damos filter One DAMON context can have multiple monitoring targets, and DAMOS schemes are applied to all targets. In some cases, users need to apply different scheme to different targets. Retrieving monitoring results via DAMON sysfs interface' 'tried_regions' directory could be one good example. Also, there could be cases that cgroup DAMOS filter is not enough. All such use cases can be worked around by having multiple DAMON contexts having only single target, but it is inefficient in terms of resource usage, thogh the overhead is not estimated to be huge. Implement DAMON monitoring target based DAMOS filter for the case. Like address range target DAMOS filter, handle these filters in the DAMON core layer, since it is more efficient than doing in operations set layer. This also means that regions that filtered out by monitoring target type DAMOS filters are counted as not tried by the scheme. Hence, target granularity monitoring results retrieval via DAMON sysfs interface becomes available. 
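For illustration, excluding one monitoring target from a scheme would look roughly like this; damos_new_filter() and damos_add_filter() are assumed to be the existing filter constructor/registration helpers:

/* Sketch: regions of monitoring target 0 are filtered out, so the scheme
 * action is never applied to that target. */
static int exclude_target0_sketch(struct damos *scheme)
{
	struct damos_filter *filter;

	filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true);
	if (!filter)
		return -ENOMEM;
	filter->target_idx = 0;
	damos_add_filter(scheme, filter);
	return 0;
}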
Link: https://lkml.kernel.org/r/20230802214312.110532-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 476f37a883a4..ae2664d1d5f1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -227,6 +227,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_ADDR: Address range. + * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. * * The anon pages type and memcg type filters are handled by underlying @@ -244,6 +245,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_ADDR, + DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, }; @@ -253,6 +255,9 @@ enum damos_filter_type { * @matching: If the matching page should filtered out or in. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. + * @target_idx: Index of the &struct damon_target of + * &damon_ctx->adaptive_targets if @type is + * DAMOS_FILTER_TYPE_TARGET. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each @@ -266,6 +271,7 @@ struct damos_filter { union { unsigned short memcg_id; struct damon_addr_range addr_range; + int target_idx; }; struct list_head list; }; -- cgit v1.2.3 From ce2fc5fffdfa9fc1412aff108afa102ddf82fd2b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:20 -0700 Subject: mm: for !CONFIG_PER_VMA_LOCK equate write lock assertion for vma and mmap When CONFIG_PER_VMA_LOCK=n, vma_assert_write_locked() should be equivalent to mmap_assert_write_locked(). Link: https://lkml.kernel.org/r/20230804152724.3090321-3-surenb@google.com Suggested-by: Jann Horn Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Linus Torvalds Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f64d1de3af09..49eafc62b4e6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -738,7 +738,8 @@ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} -static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} +static inline void vma_assert_write_locked(struct vm_area_struct *vma) + { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} -- cgit v1.2.3 From 60081bf19b0ec8fa40c589bd361fa2bc763f1050 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:22 -0700 Subject: mm: lock vma explicitly before doing vm_flags_reset and vm_flags_reset_once Implicit vma locking inside vm_flags_reset() and vm_flags_reset_once() is not obvious and makes it hard to understand where vma locking is happening. Also in some cases (like in dup_userfaultfd()) vma should be locked earlier than vma_flags modification. To make locking more visible, change these functions to assert that the vma write lock is taken and explicitly lock the vma beforehand. 
Fix userfaultfd functions which should lock the vma earlier. Link: https://lkml.kernel.org/r/20230804152724.3090321-5-surenb@google.com Suggested-by: Linus Torvalds Signed-off-by: Suren Baghdasaryan Cc: Jann Horn Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 49eafc62b4e6..5a6ff9140090 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -774,18 +774,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma, ACCESS_PRIVATE(vma, __vm_flags) = flags; } -/* Use when VMA is part of the VMA tree and modifications need coordination */ +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { - vma_start_write(vma); + vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { - vma_start_write(vma); + vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } -- cgit v1.2.3 From 9a9d0b829901125553c36b9512b2a5da4505be31 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 7 Aug 2023 01:16:11 +0200 Subject: mm: move dummy_vm_ops out of a header Otherwise the kernel ends up with multiple copies: $ nm vmlinux | grep dummy_vm_ops ffffffff81e4ea00 d dummy_vm_ops.2 ffffffff81e11760 d dummy_vm_ops.254 ffffffff81e406e0 d dummy_vm_ops.4 ffffffff81e3c780 d dummy_vm_ops.7 While here prefix it with vma_. Link: https://lkml.kernel.org/r/20230806231611.1395735-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a6ff9140090..c63ec57a54dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -751,17 +751,17 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #endif /* CONFIG_PER_VMA_LOCK */ +extern const struct vm_operations_struct vma_dummy_vm_ops; + /* * WARNING: vma_init does not initialize vma->vm_lock. * Use vm_area_alloc()/vm_area_free() if vma needs locking. */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { - static const struct vm_operations_struct dummy_vm_ops = {}; - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; - vma->vm_ops = &dummy_vm_ops; + vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); -- cgit v1.2.3 From 3f32c49ed6f15c8412a8abc93a92c4b37e6c4592 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 8 Aug 2023 11:33:59 +0800 Subject: mm: memtest: convert to memtest_report_meminfo() It is better to not expose too many internal variables of memtest, add a helper memtest_report_meminfo() to show memtest results. 
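The intended caller is simply the /proc/meminfo show path (sketch; the surrounding function is abbreviated):

static int meminfo_proc_show_sketch(struct seq_file *m, void *v)
{
	/* ... existing meminfo lines ... */
	memtest_report_meminfo(m);	/* static inline no-op without CONFIG_MEMTEST */
	return 0;
}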
Link: https://lkml.kernel.org/r/20230808033359.174986-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport (IBM) Cc: Matthew Wilcox Cc: Tomas Mudrunka Signed-off-by: Andrew Morton --- include/linux/memblock.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 0d031fbfea25..1c1072e3ca06 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -594,13 +594,11 @@ extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif #ifdef CONFIG_MEMTEST -extern phys_addr_t early_memtest_bad_size; /* Size of faulty ram found by memtest */ -extern bool early_memtest_done; /* Was early memtest done? */ -extern void early_memtest(phys_addr_t start, phys_addr_t end); +void early_memtest(phys_addr_t start, phys_addr_t end); +void memtest_report_meminfo(struct seq_file *m); #else -static inline void early_memtest(phys_addr_t start, phys_addr_t end) -{ -} +static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } +static inline void memtest_report_meminfo(struct seq_file *m) { } #endif -- cgit v1.2.3 From e3c2bfdd33a30b34674fb8839f5476ab2702c1c1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:57 +0530 Subject: mm/memory_hotplug: allow memmap on memory hotplug request to fallback If not supported, fallback to not using memap on memmory. This avoids the need for callers to do the fallback. Link: https://lkml.kernel.org/r/20230808091501.287660-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 013c69753c91..7d2076583494 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -97,6 +97,8 @@ typedef int __bitwise mhp_t; * To do so, we will use the beginning of the hot-added range to build * the page tables for the memmap array that describes the entire range. * Only selected architectures support it with SPARSE_VMEMMAP. + * This is only a hint, the core kernel can decide to not do this based on + * different alignment checks. */ #define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) /* @@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); -extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From 1a8c64e110435e44e71bcd50a75663174b575f22 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:45:01 +0530 Subject: mm/memory_hotplug: embed vmem_altmap details in memory block With memmap on memory, some architecture needs more details w.r.t altmap such as base_pfn, end_pfn, etc to unmap vmemmap memory. Instead of computing them again when we remove a memory block, embed vmem_altmap details in struct memory_block if we are using memmap on memory block feature. 
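A sketch of what the removal side can now do with the embedded details (the unmap helper named here is a placeholder, not an existing interface):

static void remove_block_vmemmap_sketch(struct memory_block *mem)
{
	struct vmem_altmap *altmap = mem->altmap;

	if (!altmap)
		return;	/* block was not added with memmap-on-memory */

	/* base_pfn, end_pfn etc. no longer need to be recomputed here. */
	arch_unmap_vmemmap_sketch(altmap->base_pfn, altmap->end_pfn);
}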
[yangyingliang@huawei.com: fix error return code in add_memory_resource()] Link: https://lkml.kernel.org/r/20230809081552.1351184-1-yangyingliang@huawei.com Link: https://lkml.kernel.org/r/20230808091501.287660-7-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Yang Yingliang Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- include/linux/memory.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 31343566c221..f53cfdaaaa41 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -77,11 +77,7 @@ struct memory_block { */ struct zone *zone; struct device dev; - /* - * Number of vmemmap pages. These pages - * lay at the beginning of the memory block. - */ - unsigned long nr_vmemmap_pages; + struct vmem_altmap *altmap; struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) @@ -147,7 +143,7 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); -- cgit v1.2.3 From f7bda0d85dd7733143b5ea66987cb9b102bd0189 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:43 -0700 Subject: mm: add PAGE_TYPE_OP folio functions Patch series "Split ptdesc from struct page", v9. The MM subsystem is trying to shrink struct page. This patchset introduces a memory descriptor for page table tracking - struct ptdesc. This patchset introduces ptdesc, splits ptdesc from struct page, and converts many callers of page table constructor/destructors to use ptdescs. Ptdesc is a foundation to further standardize page tables, and eventually allow for dynamic allocation of page tables independent of struct page. However, the use of pages for page table tracking is quite deeply ingrained and varied across archictectures, so there is still a lot of work to be done before that can happen. This patch (of 31): No folio equivalents for page type operations have been defined, so define them for later folio conversions. Also changes the Page##uname macros to take in const struct page* since we only read the memory here. Link: https://lkml.kernel.org/r/20230807230513.102486-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230807230513.102486-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Huacai Chen Cc: Hugh Dickins Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Cc: Geert Uytterhoeven Cc: Guo Ren Cc: John Paul Adrian Glaubitz Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 92a2063a0a23..9218028caf33 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -908,6 +908,8 @@ static inline bool is_page_hwpoison(struct page *page) #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) +#define folio_test_type(folio, flag) \ + ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) static inline int page_type_has_type(unsigned int page_type) { @@ -919,27 +921,41 @@ static inline int page_has_type(struct page *page) return page_type_has_type(page->page_type); } -#define PAGE_TYPE_OPS(uname, lname) \ -static __always_inline int Page##uname(struct page *page) \ +#define PAGE_TYPE_OPS(uname, lname, fname) \ +static __always_inline int Page##uname(const struct page *page) \ { \ return PageType(page, PG_##lname); \ } \ +static __always_inline int folio_test_##fname(const struct folio *folio)\ +{ \ + return folio_test_type(folio, PG_##lname); \ +} \ static __always_inline void __SetPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!PageType(page, 0), page); \ page->page_type &= ~PG_##lname; \ } \ +static __always_inline void __folio_set_##fname(struct folio *folio) \ +{ \ + VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ + folio->page.page_type &= ~PG_##lname; \ +} \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type |= PG_##lname; \ -} +} \ +static __always_inline void __folio_clear_##fname(struct folio *folio) \ +{ \ + VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ + folio->page.page_type |= PG_##lname; \ +} \ /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ -PAGE_TYPE_OPS(Buddy, buddy) +PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the @@ -963,7 +979,7 @@ PAGE_TYPE_OPS(Buddy, buddy) * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ -PAGE_TYPE_OPS(Offline, offline) +PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); @@ -973,12 +989,12 @@ extern void page_offline_end(void); /* * Marks pages in use as page tables. */ -PAGE_TYPE_OPS(Table, table) +PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ -PAGE_TYPE_OPS(Guard, guard) +PAGE_TYPE_OPS(Guard, guard, guard) extern bool is_free_buddy_page(struct page *page); -- cgit v1.2.3 From 9a35de4ffc209bd7956c4811ad17c4883791db43 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:44 -0700 Subject: pgtable: create struct ptdesc Currently, page table information is stored within struct page. As part of simplifying struct page, create struct ptdesc for page table information. 
Link: https://lkml.kernel.org/r/20230807230513.102486-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1fc4b9c2c8a6..d4ebcefe482e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -397,6 +397,76 @@ FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); #undef FOLIO_MATCH +/** + * struct ptdesc - Memory descriptor for page tables. + * @__page_flags: Same as page flags. Unused for page tables. + * @pt_rcu_head: For freeing page table pages. + * @pt_list: List of used page tables. Used for s390 and x86. + * @_pt_pad_1: Padding that aliases with page's compound head. + * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. + * @__page_mapping: Aliases with page->mapping. Unused for page tables. + * @pt_mm: Used for x86 pgds. + * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only. + * @_pt_pad_2: Padding to ensure proper alignment. + * @ptl: Lock for the page table. + * @__page_type: Same as page->page_type. Unused for page tables. + * @_refcount: Same as page refcount. Used for s390 page tables. + * @pt_memcg_data: Memcg data. Tracked for page tables here. + * + * This struct overlays struct page for now. Do not modify without a good + * understanding of the issues. + */ +struct ptdesc { + unsigned long __page_flags; + + union { + struct rcu_head pt_rcu_head; + struct list_head pt_list; + struct { + unsigned long _pt_pad_1; + pgtable_t pmd_huge_pte; + }; + }; + unsigned long __page_mapping; + + union { + struct mm_struct *pt_mm; + atomic_t pt_frag_refcount; + }; + + union { + unsigned long _pt_pad_2; +#if ALLOC_SPLIT_PTLOCKS + spinlock_t *ptl; +#else + spinlock_t ptl; +#endif + }; + unsigned int __page_type; + atomic_t _refcount; +#ifdef CONFIG_MEMCG + unsigned long pt_memcg_data; +#endif +}; + +#define TABLE_MATCH(pg, pt) \ + static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) +TABLE_MATCH(flags, __page_flags); +TABLE_MATCH(compound_head, pt_list); +TABLE_MATCH(compound_head, _pt_pad_1); +TABLE_MATCH(pmd_huge_pte, pmd_huge_pte); +TABLE_MATCH(mapping, __page_mapping); +TABLE_MATCH(pt_mm, pt_mm); +TABLE_MATCH(ptl, ptl); +TABLE_MATCH(rcu_head, pt_rcu_head); +TABLE_MATCH(page_type, __page_type); +TABLE_MATCH(_refcount, _refcount); +#ifdef CONFIG_MEMCG +TABLE_MATCH(memcg_data, pt_memcg_data); +#endif +#undef TABLE_MATCH +static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); + /* * Used for sizing the vmemmap region on some architectures */ -- cgit v1.2.3 From bf2d4334f72e4e033166c5a3bf1331a7238eab9d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:45 -0700 Subject: mm: add utility functions for ptdesc Introduce utility functions setting the foundation for ptdescs. These will also assist in the splitting out of ptdesc from struct page. 
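As a rough sketch of the one-allocation/one-free pairing described below (pagetable_alloc(), ptdesc_address(), virt_to_ptdesc() and pagetable_free() are the helpers added in this patch's hunk; the arch_*_example() wrappers are hypothetical):

	static pgd_t *arch_pgd_alloc_example(gfp_t gfp)
	{
		/* One compound allocation covers the table and its descriptor. */
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		return ptdesc ? ptdesc_address(ptdesc) : NULL;
	}

	static void arch_pgd_free_example(pgd_t *pgd)
	{
		/* ... and one matching free. */
		pagetable_free(virt_to_ptdesc(pgd));
	}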
Functions that focus on the descriptor are prefixed with ptdesc_* while functions that focus on the pagetable are prefixed with pagetable_*. pagetable_alloc() is defined to allocate new ptdesc pages as compound pages. This is to standardize ptdescs by allowing for one allocation and one free function, in contrast to 2 allocation and 2 free functions. Link: https://lkml.kernel.org/r/20230807230513.102486-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 12 ++++++++++ 2 files changed, 73 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c63ec57a54dc..f75058783359 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2772,6 +2772,57 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU */ +static inline struct ptdesc *virt_to_ptdesc(const void *x) +{ + return page_ptdesc(virt_to_page(x)); +} + +static inline void *ptdesc_to_virt(const struct ptdesc *pt) +{ + return page_to_virt(ptdesc_page(pt)); +} + +static inline void *ptdesc_address(const struct ptdesc *pt) +{ + return folio_address(ptdesc_folio(pt)); +} + +static inline bool pagetable_is_reserved(struct ptdesc *pt) +{ + return folio_test_reserved(ptdesc_folio(pt)); +} + +/** + * pagetable_alloc - Allocate pagetables + * @gfp: GFP flags + * @order: desired pagetable order + * + * pagetable_alloc allocates memory for page tables as well as a page table + * descriptor to describe that memory. + * + * Return: The ptdesc describing the allocated page tables. + */ +static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) +{ + struct page *page = alloc_pages(gfp | __GFP_COMP, order); + + return page_ptdesc(page); +} + +/** + * pagetable_free - Free pagetables + * @pt: The page table descriptor + * + * pagetable_free frees the memory of all page tables described by a page + * table descriptor and the memory for the descriptor itself. + */ +static inline void pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); @@ -2898,6 +2949,11 @@ static inline struct page *pmd_pgtable_page(pmd_t *pmd) return virt_to_page((void *)((unsigned long) pmd & mask)); } +static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) +{ + return page_ptdesc(pmd_pgtable_page(pmd)); +} + static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(pmd_pgtable_page(pmd)); @@ -3010,6 +3066,11 @@ static inline void mark_page_reserved(struct page *page) adjust_managed_page_count(page, -1); } +static inline void free_reserved_ptdesc(struct ptdesc *pt) +{ + free_reserved_page(ptdesc_page(pt)); +} + /* * Default method to free all the __init memory into the buddy system. 
* The freed pages will be poisoned with pattern "poison" if it's within diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d4ebcefe482e..c57e940e60d0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -467,6 +467,18 @@ TABLE_MATCH(memcg_data, pt_memcg_data); #undef TABLE_MATCH static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); +#define ptdesc_page(pt) (_Generic((pt), \ + const struct ptdesc *: (const struct page *)(pt), \ + struct ptdesc *: (struct page *)(pt))) + +#define ptdesc_folio(pt) (_Generic((pt), \ + const struct ptdesc *: (const struct folio *)(pt), \ + struct ptdesc *: (struct folio *)(pt))) + +#define page_ptdesc(p) (_Generic((p), \ + const struct page *: (const struct ptdesc *)(p), \ + struct page *: (struct ptdesc *)(p))) + /* * Used for sizing the vmemmap region on some architectures */ -- cgit v1.2.3 From f8546d8494ca22fe530c8ba79beaf1509b47f6d1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:46 -0700 Subject: mm: convert pmd_pgtable_page() callers to use pmd_ptdesc() Converts internal pmd_pgtable_page() callers to use pmd_ptdesc(). This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f75058783359..6fee233dfccc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2956,7 +2956,7 @@ static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_pgtable_page(pmd)); + return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd))); } static inline bool pmd_ptlock_init(struct page *page) @@ -2975,7 +2975,7 @@ static inline void pmd_ptlock_free(struct page *page) ptlock_free(page); } -#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte) +#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else -- cgit v1.2.3 From f5ecca06b3a5d0371ee27ee08aa06c686407a8af Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:47 -0700 Subject: mm: convert ptlock_alloc() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6fee233dfccc..ccea0665247c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2826,7 +2826,7 @@ static inline void pagetable_free(struct ptdesc *pt) #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); -extern bool ptlock_alloc(struct page *page); +bool ptlock_alloc(struct ptdesc *ptdesc); extern void ptlock_free(struct page *page); static inline spinlock_t *ptlock_ptr(struct page *page) @@ -2838,7 +2838,7 @@ static inline void ptlock_cache_init(void) { } -static inline bool ptlock_alloc(struct page *page) +static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } @@ -2868,7 +2868,7 @@ static inline bool ptlock_init(struct page *page) * slab code uses page->slab_cache, which share storage with page->ptl. */ VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); - if (!ptlock_alloc(page)) + if (!ptlock_alloc(page_ptdesc(page))) return false; spin_lock_init(ptlock_ptr(page)); return true; -- cgit v1.2.3 From 1865484af6b2ced2f367c1042b5f3816c11db148 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:48 -0700 Subject: mm: convert ptlock_ptr() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ccea0665247c..7860529737d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2829,9 +2829,9 @@ void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); extern void ptlock_free(struct page *page); -static inline spinlock_t *ptlock_ptr(struct page *page) +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { - return page->ptl; + return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) @@ -2847,15 +2847,15 @@ static inline void ptlock_free(struct page *page) { } -static inline spinlock_t *ptlock_ptr(struct page *page) +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { - return &page->ptl; + return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_page(*pmd)); + return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } static inline bool ptlock_init(struct page *page) @@ -2870,7 +2870,7 @@ static inline bool ptlock_init(struct page *page) VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); if (!ptlock_alloc(page_ptdesc(page))) return false; - spin_lock_init(ptlock_ptr(page)); + spin_lock_init(ptlock_ptr(page_ptdesc(page))); return true; } @@ -2956,7 +2956,7 @@ static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd))); + return ptlock_ptr(pmd_ptdesc(pmd)); } static inline bool pmd_ptlock_init(struct page *page) -- cgit v1.2.3 From edbaefe53c6418ea11372e6b3ce952ab8caa1f78 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:49 -0700 Subject: mm: convert pmd_ptlock_init() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-8-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7860529737d4..d70366169c3d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2959,12 +2959,12 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(pmd_ptdesc(pmd)); } -static inline bool pmd_ptlock_init(struct page *page) +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - page->pmd_huge_pte = NULL; + ptdesc->pmd_huge_pte = NULL; #endif - return ptlock_init(page); + return ptlock_init(ptdesc_page(ptdesc)); } static inline void pmd_ptlock_free(struct page *page) @@ -2984,7 +2984,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } -static inline bool pmd_ptlock_init(struct page *page) { return true; } +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void pmd_ptlock_free(struct page *page) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3000,7 +3000,7 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pgtable_pmd_page_ctor(struct page *page) { - if (!pmd_ptlock_init(page)) + if (!pmd_ptlock_init(page_ptdesc(page))) return false; __SetPageTable(page); inc_lruvec_page_state(page, NR_PAGETABLE); -- cgit v1.2.3 From 75b25d49ca6638f9a4eac47cff508b174743d907 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:50 -0700 Subject: mm: convert ptlock_init() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-9-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d70366169c3d..e53ef2eb95bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2858,7 +2858,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } -static inline bool ptlock_init(struct page *page) +static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initialize page->private (and therefore page->ptl) @@ -2867,10 +2867,10 @@ static inline bool ptlock_init(struct page *page) * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl. 
*/ - VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); - if (!ptlock_alloc(page_ptdesc(page))) + VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); + if (!ptlock_alloc(ptdesc)) return false; - spin_lock_init(ptlock_ptr(page_ptdesc(page))); + spin_lock_init(ptlock_ptr(ptdesc)); return true; } @@ -2883,13 +2883,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } static inline void ptlock_cache_init(void) {} -static inline bool ptlock_init(struct page *page) { return true; } +static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ static inline bool pgtable_pte_page_ctor(struct page *page) { - if (!ptlock_init(page)) + if (!ptlock_init(page_ptdesc(page))) return false; __SetPageTable(page); inc_lruvec_page_state(page, NR_PAGETABLE); @@ -2964,7 +2964,7 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptdesc->pmd_huge_pte = NULL; #endif - return ptlock_init(ptdesc_page(ptdesc)); + return ptlock_init(ptdesc); } static inline void pmd_ptlock_free(struct page *page) -- cgit v1.2.3 From 7e5f42ae3413785c68c383acb787f9ce8f243096 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:51 -0700 Subject: mm: convert pmd_ptlock_free() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-10-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e53ef2eb95bb..8884c700dfc6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2967,12 +2967,12 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) return ptlock_init(ptdesc); } -static inline void pmd_ptlock_free(struct page *page) +static inline void pmd_ptlock_free(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(page->pmd_huge_pte, page); + VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); #endif - ptlock_free(page); + ptlock_free(ptdesc_page(ptdesc)); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) @@ -2985,7 +2985,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void pmd_ptlock_free(struct page *page) {} +static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3009,7 +3009,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) static inline void pgtable_pmd_page_dtor(struct page *page) { - pmd_ptlock_free(page); + pmd_ptlock_free(page_ptdesc(page)); __ClearPageTable(page); dec_lruvec_page_state(page, NR_PAGETABLE); } -- cgit v1.2.3 From 6ed1b8a09deb0b99fd3b54e11535c80284689555 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:52 -0700 Subject: mm: convert ptlock_free() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-11-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8884c700dfc6..d0fb31bcd482 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2827,7 +2827,7 @@ static inline void pagetable_free(struct ptdesc *pt) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); -extern void ptlock_free(struct page *page); +void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { @@ -2843,7 +2843,7 @@ static inline bool ptlock_alloc(struct ptdesc *ptdesc) return true; } -static inline void ptlock_free(struct page *page) +static inline void ptlock_free(struct ptdesc *ptdesc) { } @@ -2884,7 +2884,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void ptlock_free(struct page *page) {} +static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ static inline bool pgtable_pte_page_ctor(struct page *page) @@ -2898,7 +2898,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page) static inline void pgtable_pte_page_dtor(struct page *page) { - ptlock_free(page); + ptlock_free(page_ptdesc(page)); __ClearPageTable(page); dec_lruvec_page_state(page, NR_PAGETABLE); } @@ -2972,7 +2972,7 @@ static inline void pmd_ptlock_free(struct ptdesc *ptdesc) #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); #endif - ptlock_free(ptdesc_page(ptdesc)); + ptlock_free(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) -- cgit v1.2.3 From 7e11dca14b27e11596a0c49c7d20bc7816cb0508 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:53 -0700 Subject: mm: create ptdesc equivalents for pgtable_{pte,pmd}_page_{ctor,dtor} Create pagetable_pte_ctor(), pagetable_pmd_ctor(), pagetable_pte_dtor(), and pagetable_pmd_dtor() and make the original pgtable constructor/destructors wrappers. Link: https://lkml.kernel.org/r/20230807230513.102486-12-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 56 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d0fb31bcd482..6fdc294ada0d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2887,20 +2887,34 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ -static inline bool pgtable_pte_page_ctor(struct page *page) +static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { - if (!ptlock_init(page_ptdesc(page))) + struct folio *folio = ptdesc_folio(ptdesc); + + if (!ptlock_init(ptdesc)) return false; - __SetPageTable(page); - inc_lruvec_page_state(page, NR_PAGETABLE); + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } +static inline bool pgtable_pte_page_ctor(struct page *page) +{ + return pagetable_pte_ctor(page_ptdesc(page)); +} + +static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + ptlock_free(ptdesc); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + static inline void pgtable_pte_page_dtor(struct page *page) { - ptlock_free(page_ptdesc(page)); - __ClearPageTable(page); - dec_lruvec_page_state(page, NR_PAGETABLE); + pagetable_pte_dtor(page_ptdesc(page)); } pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); @@ -2998,20 +3012,34 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -static inline bool pgtable_pmd_page_ctor(struct page *page) +static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { - if (!pmd_ptlock_init(page_ptdesc(page))) + struct folio *folio = ptdesc_folio(ptdesc); + + if (!pmd_ptlock_init(ptdesc)) return false; - __SetPageTable(page); - inc_lruvec_page_state(page, NR_PAGETABLE); + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } +static inline bool pgtable_pmd_page_ctor(struct page *page) +{ + return pagetable_pmd_ctor(page_ptdesc(page)); +} + +static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + pmd_ptlock_free(ptdesc); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + static inline void pgtable_pmd_page_dtor(struct page *page) { - pmd_ptlock_free(page_ptdesc(page)); - __ClearPageTable(page); - dec_lruvec_page_state(page, NR_PAGETABLE); + pagetable_pmd_dtor(page_ptdesc(page)); } /* -- cgit v1.2.3 From 4f054c28f425b2f1623c9fdc2c2f69719a22190b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:57 -0700 Subject: mm: remove page table members from struct page The page table members are now split out into their own ptdesc struct. Remove them from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-16-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c57e940e60d0..369b7fd35d03 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,24 +141,6 @@ struct page { struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; - struct { /* Page table pages */ - unsigned long _pt_pad_1; /* compound_head */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - /* - * A PTE page table page might be freed by use of - * rcu_head: which overlays those two fields above. - */ - unsigned long _pt_pad_2; /* mapping */ - union { - struct mm_struct *pt_mm; /* x86 pgds only */ - atomic_t pt_frag_refcount; /* powerpc */ - }; -#if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; -#else - spinlock_t ptl; -#endif - }; struct { /* ZONE_DEVICE pages */ /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; @@ -454,10 +436,7 @@ struct ptdesc { TABLE_MATCH(flags, __page_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); -TABLE_MATCH(pmd_huge_pte, pmd_huge_pte); TABLE_MATCH(mapping, __page_mapping); -TABLE_MATCH(pt_mm, pt_mm); -TABLE_MATCH(ptl, ptl); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, _refcount); -- cgit v1.2.3 From 9a4bbd8d975e01a777005e00c0e26d72bb6cc15a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:13 -0700 Subject: mm: remove pgtable_{pmd, pte}_page_{ctor, dtor}() wrappers These functions are no longer necessary. Remove them and cleanup Documentation referencing them. Link: https://lkml.kernel.org/r/20230807230513.102486-32-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6fdc294ada0d..88ac4cf9d3ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2898,11 +2898,6 @@ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) return true; } -static inline bool pgtable_pte_page_ctor(struct page *page) -{ - return pagetable_pte_ctor(page_ptdesc(page)); -} - static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -2912,11 +2907,6 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -static inline void pgtable_pte_page_dtor(struct page *page) -{ - pagetable_pte_dtor(page_ptdesc(page)); -} - pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { @@ -3023,11 +3013,6 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) return true; } -static inline bool pgtable_pmd_page_ctor(struct page *page) -{ - return pagetable_pmd_ctor(page_ptdesc(page)); -} - static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3037,11 +3022,6 @@ static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -static inline void pgtable_pmd_page_dtor(struct page *page) -{ - pagetable_pmd_dtor(page_ptdesc(page)); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be -- cgit v1.2.3 From 202e14222fadb246dfdf182e67de1518e86a1e20 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:40:58 +1000 Subject: memfd: do not -EACCES old memfd_create() users with vm.memfd_noexec=2 Given the difficulty of auditing all of userspace to figure out whether every memfd_create() user has switched to passing MFD_EXEC and MFD_NOEXEC_SEAL flags, it seems far less distruptive to make it possible for older programs that don't make use of executable memfds to run under vm.memfd_noexec=2. Otherwise, a small dependency change can result in spurious errors. For programs that don't use executable memfds, passing MFD_NOEXEC_SEAL is functionally a no-op and thus having the same In addition, every failure under vm.memfd_noexec=2 needs to print to the kernel log so that userspace can figure out where the error came from. The concerns about pr_warn_ratelimited() spam that caused the switch to pr_warn_once()[1,2] do not apply to the vm.memfd_noexec=2 case. This is a user-visible API change, but as it allows programs to do something that would be blocked before, and the sysctl itself was broken and recently released, it seems unlikely this will cause any issues. 
[1]: https://lore.kernel.org/Y5yS8wCnuYGLHMj4@x1n/ [2]: https://lore.kernel.org/202212161233.85C9783FB@keescook/ Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-2-7ff9e3e10ba6@cyphar.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Signed-off-by: Aleksa Sarai Cc: Dominique Martinet Cc: Christian Brauner Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c758809d5bcf..53974d79d98e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -17,18 +17,10 @@ struct fs_pin; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) -/* - * sysctl for vm.memfd_noexec - * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL - * acts like MFD_EXEC was set. - * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL - * acts like MFD_NOEXEC_SEAL was set. - * 2: memfd_create() without MFD_NOEXEC_SEAL will be - * rejected. - */ -#define MEMFD_NOEXEC_SCOPE_EXEC 0 -#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 -#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 +/* modes for vm.memfd_noexec sysctl */ +#define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */ +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */ +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */ #endif struct pid_namespace { -- cgit v1.2.3 From 9876cfe8ec1cb3c88de31f4d58d57b0e7e22bcc4 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:41:00 +1000 Subject: memfd: replace ratcheting feature from vm.memfd_noexec with hierarchy This sysctl has the very unusual behaviour of not allowing any user (even CAP_SYS_ADMIN) to reduce the restriction setting, meaning that if you were to set this sysctl to a more restrictive option in the host pidns you would need to reboot your machine in order to reset it. The justification given in [1] is that this is a security feature and thus it should not be possible to disable. Aside from the fact that we have plenty of security-related sysctls that can be disabled after being enabled (fs.protected_symlinks for instance), the protection provided by the sysctl is to stop users from being able to create a binary and then execute it. 
A user with CAP_SYS_ADMIN can trivially do this without memfd_create(2): % cat mount-memfd.c #include #include #include #include #include #include #define SHELLCODE "#!/bin/echo this file was executed from this totally private tmpfs:" int main(void) { int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); assert(fsfd >= 0); assert(!fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 2)); int dfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); assert(dfd >= 0); int execfd = openat(dfd, "exe", O_CREAT | O_RDWR | O_CLOEXEC, 0782); assert(execfd >= 0); assert(write(execfd, SHELLCODE, strlen(SHELLCODE)) == strlen(SHELLCODE)); assert(!close(execfd)); char *execpath = NULL; char *argv[] = { "bad-exe", NULL }, *envp[] = { NULL }; execfd = openat(dfd, "exe", O_PATH | O_CLOEXEC); assert(execfd >= 0); assert(asprintf(&execpath, "/proc/self/fd/%d", execfd) > 0); assert(!execve(execpath, argv, envp)); } % ./mount-memfd this file was executed from this totally private tmpfs: /proc/self/fd/5 % Given that it is possible for CAP_SYS_ADMIN users to create executable binaries without memfd_create(2) and without touching the host filesystem (not to mention the many other things a CAP_SYS_ADMIN process would be able to do that would be equivalent or worse), it seems strange to cause a fair amount of headache to admins when there doesn't appear to be an actual security benefit to blocking this. There appear to be concerns about confused-deputy-esque attacks[2] but a confused deputy that can write to arbitrary sysctls is a bigger security issue than executable memfds. /* New API */ The primary requirement from the original author appears to be more based on the need to be able to restrict an entire system in a hierarchical manner[3], such that child namespaces cannot re-enable executable memfds. So, implement that behaviour explicitly -- the vm.memfd_noexec scope is evaluated up the pidns tree to &init_pid_ns and you have the most restrictive value applied to you. The new lower limit you can set vm.memfd_noexec is whatever limit applies to your parent. Note that a pidns will inherit a copy of the parent pidns's effective vm.memfd_noexec setting at unshare() time. This matches the existing behaviour, and it also ensures that a pidns will never have its vm.memfd_noexec setting *lowered* behind its back (but it will be raised if the parent raises theirs). /* Backwards Compatibility */ As the previous version of the sysctl didn't allow you to lower the setting at all, there are no backwards compatibility issues with this aspect of the change. However, it should be noted that the setting is now completely hierarchical. Previously, a cloned pidns would just copy the current pidns setting, meaning that if the parent's vm.memfd_noexec was changed it wouldn't propagate to existing pid namespaces. Now, the restriction applies recursively. This is a uAPI change, however: * The sysctl is very new, having been merged in 6.3. * Several aspects of the sysctl were broken up until this patchset and the other patchset by Jeff Xu last month. And thus it seems incredibly unlikely that any real users would run into this issue. In the worst case, if this causes userspace issues we could make it so that modifying the setting follows the hierarchical rules but the restriction checking uses the cached copy.
[1]: https://lore.kernel.org/CABi2SkWnAgHK1i6iqSqPMYuNEhtHBkO8jUuCvmG3RmUB5TKHJw@mail.gmail.com/ [2]: https://lore.kernel.org/CALmYWFs_dNCzw_pW1yRAo4bGCPEtykroEQaowNULp7svwMLjOg@mail.gmail.com/ [3]: https://lore.kernel.org/CALmYWFuahdUF7cT4cm7_TGLqPanuHXJ-hVSfZt7vpTnc18DPrw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-4-7ff9e3e10ba6@cyphar.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Signed-off-by: Aleksa Sarai Cc: Dominique Martinet Cc: Christian Brauner Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 53974d79d98e..f9f9931e02d6 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -39,7 +39,6 @@ struct pid_namespace { int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) - /* sysctl for vm.memfd_noexec */ int memfd_noexec_scope; #endif } __randomize_layout; @@ -56,6 +55,23 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + int scope = MEMFD_NOEXEC_SCOPE_EXEC; + + for (; ns; ns = ns->parent) + scope = max(scope, READ_ONCE(ns->memfd_noexec_scope)); + + return scope; +} +#else +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + return 0; +} +#endif + extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); @@ -70,6 +86,11 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + return 0; +} + static inline struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns) { -- cgit v1.2.3 From 1b6754fea43cebd72d969618219347c5ec01eb8d Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 12 Aug 2023 11:01:28 +0000 Subject: writeback: remove unused delaration of bdi_async_bio_wq It seems it was introduced by commit d3f77dfdc718 ("blkcg: implement REQ_CGROUP_PUNT") unintentionally, but the definition does not exist, remove it. Link: https://lkml.kernel.org/r/20230812110128.482650-1-xiujianfeng@huaweicloud.com Signed-off-by: Xiu Jianfeng Acked-by: Tejun Heo Cc: Stefan Roesch Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fbad4fcd408e..1a97277f99b1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -46,7 +46,6 @@ extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; -extern struct workqueue_struct *bdi_async_bio_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { -- cgit v1.2.3 From e45a2e947dfa6da2d73e2cf91ed6399c12522d4f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 15 Aug 2023 11:06:09 +0800 Subject: pagemap: remove wait_on_page_locked_killable() There is no users of wait_on_page_locked_killable(), remove it. 
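For reference, any remaining external user could open-code the removed wrapper via the folio API (hypothetical shim, mirroring the body that this patch deletes):

	static inline int wait_on_page_locked_killable_example(struct page *page)
	{
		return folio_wait_locked_killable(page_folio(page));
	}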
Link: https://lkml.kernel.org/r/20230815030609.39313-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0ab0f2362b9b..f4f24b594cd7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1060,11 +1060,6 @@ static inline void wait_on_page_locked(struct page *page) folio_wait_locked(page_folio(page)); } -static inline int wait_on_page_locked_killable(struct page *page) -{ - return folio_wait_locked_killable(page_folio(page)); -} - void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); -- cgit v1.2.3 From 14fb1fd751fa09440634b909e6f5f53e2cba9ae0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Aug 2023 16:32:06 +0200 Subject: pgtable: improve pte_protnone() comment Especially the "For PROT_NONE VMAs, the PTEs are not marked _PAGE_PROTNONE" part is wrong: doing an mprotect(PROT_NONE) will end up marking all PTEs on x86_64 as _PAGE_PROTNONE, making pte_protnone() indicate "yes". So let's improve the comment, so it's easier to grasp which semantics pte_protnone() actually has. Link: https://lkml.kernel.org/r/20230803143208.383663-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Mel Gorman Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Linus Torvalds Cc: liubo Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Paolo Bonzini Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f34e0f2cb4d8..6064f454c8e3 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1333,12 +1333,16 @@ static inline int pud_trans_unstable(pud_t *pud) #ifndef CONFIG_NUMA_BALANCING /* - * Technically a PTE can be PROTNONE even when not doing NUMA balancing but - * the only case the kernel cares is for NUMA balancing and is only ever set - * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked - * _PAGE_PROTNONE so by default, implement the helper as "always no". It - * is the responsibility of the caller to distinguish between PROT_NONE - * protections and NUMA hinting fault protections. + * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is + * perfectly valid to indicate "no" in that case, which is why our default + * implementation defaults to "always no". + * + * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE + * page protection due to NUMA hinting. NUMA hinting faults only apply in + * accessible VMAs. + * + * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, + * looking at the VMA accessibility is sufficient. */ static inline int pte_protnone(pte_t pte) { -- cgit v1.2.3 From dd6fa0b61814492476463149c91110e529364e82 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:50 +0100 Subject: mm: call free_huge_page() directly Indirect calls are expensive, thanks to Spectre. Call free_huge_page() directly if the folio belongs to hugetlb. 
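A minimal sketch of the resulting dispatch (illustrative only; the destroy_large_folio() caller lives in mm/page_alloc.c and is not part of this hunk):

	void destroy_large_folio(struct folio *folio)
	{
		if (folio_test_hugetlb(folio)) {
			/* Direct call instead of an expensive indirect destructor call. */
			free_huge_page(&folio->page);
			return;
		}
		/* Non-hugetlb compound folios keep using their destructor. */
	}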
Link: https://lkml.kernel.org/r/20230816151201.3655946-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0a393bc02f25..5a1dfaffbd80 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -26,6 +26,8 @@ typedef struct { unsigned long pd; } hugepd_t; #define __hugepd(x) ((hugepd_t) { (x) }) #endif +void free_huge_page(struct page *page); + #ifdef CONFIG_HUGETLB_PAGE #include @@ -165,7 +167,6 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); -void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); -- cgit v1.2.3 From 454a00c40a21c59e99c526fe8cc57bd029cf8f0e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:51 +0100 Subject: mm: convert free_huge_page() to free_huge_folio() Pass a folio instead of the head page to save a few instructions. Update the documentation, at least in English. Link: https://lkml.kernel.org/r/20230816151201.3655946-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Sidhartha Kumar Cc: Yanteng Si Cc: David Hildenbrand Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5a1dfaffbd80..5b2626063f4f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -26,7 +26,7 @@ typedef struct { unsigned long pd; } hugepd_t; #define __hugepd(x) ((hugepd_t) { (x) }) #endif -void free_huge_page(struct page *page); +void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From 8dc4a8f1e038189cb575f89bcd23364698b88cc1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:52 +0100 Subject: mm: convert free_transhuge_folio() to folio_undo_large_rmappable() Indirect calls are expensive, thanks to Spectre. Test for TRANSHUGE_PAGE_DTOR and destroy the folio appropriately. Move the free_compound_page() call into destroy_large_folio() to simplify later patches. 
Link: https://lkml.kernel.org/r/20230816151201.3655946-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 -- include/linux/mm.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e718dbe928ba..ceda26a20830 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -141,8 +141,6 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void prep_transhuge_page(struct page *page); -void free_transhuge_page(struct page *page); - bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) diff --git a/include/linux/mm.h b/include/linux/mm.h index 55eb2789794e..0d14e2045658 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1253,9 +1253,7 @@ enum compound_dtor_id { #ifdef CONFIG_HUGETLB_PAGE HUGETLB_PAGE_DTOR, #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE TRANSHUGE_PAGE_DTOR, -#endif NR_COMPOUND_DTORS, }; -- cgit v1.2.3 From da6e7bf3a0315025e4199d599bd31763f0df3b4a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:53 +0100 Subject: mm: convert prep_transhuge_page() to folio_prep_large_rmappable() Match folio_undo_large_rmappable(), and move the casting from page to folio into the callers (which they were largely doing anyway). Link: https://lkml.kernel.org/r/20230816151201.3655946-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ceda26a20830..fa0350b0812a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -140,7 +140,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -void prep_transhuge_page(struct page *page); +void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) @@ -280,7 +280,7 @@ static inline bool hugepage_vma_check(struct vm_area_struct *vma, return false; } -static inline void prep_transhuge_page(struct page *page) {} +static inline void folio_prep_large_rmappable(struct folio *folio) {} #define transparent_hugepage_flags 0UL -- cgit v1.2.3 From 0f2f43fabb95192c73b19586ef7536d7ac7c2f8c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:54 +0100 Subject: mm: remove free_compound_page() and the compound_page_dtors array The only remaining destructor is free_compound_page(). Inline it into destroy_large_folio() and remove the array it used to live in. 
Link: https://lkml.kernel.org/r/20230816151201.3655946-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d14e2045658..0955b6b13fd0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,14 +1239,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -/* - * Compound pages have a destructor function. Provide a - * prototype for that function and accessor functions. - * These are _only_ valid on the head of a compound page. - */ -typedef void compound_page_dtor(struct page *); - -/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */ enum compound_dtor_id { NULL_COMPOUND_DTOR, COMPOUND_PAGE_DTOR, @@ -1299,8 +1291,6 @@ static inline unsigned long thp_size(struct page *page) return PAGE_SIZE << thp_order(page); } -void free_compound_page(struct page *page); - #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when -- cgit v1.2.3 From 9c5ccf2db04b8d7c3df363fdd4856c2b79ab2c6a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:55 +0100 Subject: mm: remove HUGETLB_PAGE_DTOR We can use a bit in page[1].flags to indicate that this folio belongs to hugetlb instead of using a value in page[1].dtors. That lets folio_test_hugetlb() become an inline function like it should be. We can also get rid of NULL_COMPOUND_DTOR. Link: https://lkml.kernel.org/r/20230816151201.3655946-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ---- include/linux/page-flags.h | 43 +++++++++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0955b6b13fd0..e241f5ee4dc4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1240,11 +1240,7 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); enum compound_dtor_id { - NULL_COMPOUND_DTOR, COMPOUND_PAGE_DTOR, -#ifdef CONFIG_HUGETLB_PAGE - HUGETLB_PAGE_DTOR, -#endif TRANSHUGE_PAGE_DTOR, NR_COMPOUND_DTORS, }; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9218028caf33..e9e7cc45352d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -171,15 +171,6 @@ enum pageflags { /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, -#ifdef CONFIG_MEMORY_FAILURE - /* - * Compound pages. Stored in first tail page's flags. - * Indicates that at least one subpage is hwpoisoned in the - * THP. - */ - PG_has_hwpoisoned = PG_error, -#endif - /* non-lru isolated movable page */ PG_isolated = PG_reclaim, @@ -190,6 +181,15 @@ enum pageflags { /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif + + /* + * Flags only valid for compound pages. Stored in first tail page's + * flags word. 
+ */ + + /* At least one page in this folio has the hwpoison flag set */ + PG_has_hwpoisoned = PG_error, + PG_hugetlb = PG_active, }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -812,7 +812,23 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); -bool folio_test_hugetlb(struct folio *folio); +SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) +CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) + +/** + * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs + * @folio: The folio to test. + * + * Context: Any context. Caller should have a reference on the folio to + * prevent it from being turned into a tail page. + * Return: True for hugetlbfs folios, false for anon folios or folios + * belonging to other filesystems. + */ +static inline bool folio_test_hugetlb(struct folio *folio) +{ + return folio_test_large(folio) && + test_bit(PG_hugetlb, folio_flags(folio, 1)); +} #else TESTPAGEFLAG_FALSE(Huge, hugetlb) #endif @@ -1056,6 +1072,13 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) +/* + * Flags stored in the second page of a compound page. They may overlap + * the CHECK_AT_FREE flags above, so need to be cleared. + */ +#define PAGE_FLAGS_SECOND \ + (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb) + #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** -- cgit v1.2.3 From de53c05f2ae3d47d30db58e9c4e54e3bbc868377 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:56 +0100 Subject: mm: add large_rmappable page flag Stored in the first tail page's flags, this flag replaces the destructor. That removes the last of the destructors, so remove all references to folio_dtor and compound_dtor. Link: https://lkml.kernel.org/r/20230816151201.3655946-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 ------------- include/linux/mm_types.h | 2 -- include/linux/page-flags.h | 7 ++++++- 3 files changed, 6 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e241f5ee4dc4..63d573781973 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,19 +1239,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -enum compound_dtor_id { - COMPOUND_PAGE_DTOR, - TRANSHUGE_PAGE_DTOR, - NR_COMPOUND_DTORS, -}; - -static inline void folio_set_compound_dtor(struct folio *folio, - enum compound_dtor_id compound_dtor) -{ - VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio); - folio->_folio_dtor = compound_dtor; -} - void destroy_large_folio(struct folio *folio); /* Returns the number of bytes in this potentially compound page. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index de5ac95572c8..1c5c2349c18e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -264,7 +264,6 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). 
* @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). @@ -318,7 +317,6 @@ struct folio { unsigned long _flags_1; unsigned long _head_1; /* public: */ - unsigned char _folio_dtor; unsigned char _folio_order; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e9e7cc45352d..85d54b6c9e0b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,6 +190,7 @@ enum pageflags { /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_error, PG_hugetlb = PG_active, + PG_large_rmappable = PG_workingset, /* anon or file-backed */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -806,6 +807,9 @@ static inline void ClearPageCompound(struct page *page) BUG_ON(!PageHead(page)); ClearPageHead(page); } +PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND) +#else +TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #endif #define PG_head_mask ((1UL << PG_head)) @@ -1077,7 +1081,8 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ - (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb) + (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb | \ + 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) -- cgit v1.2.3 From c704ae9797843402436190a6f094a035237fd012 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:57 +0100 Subject: mm: rearrange page flags Move PG_writeback into bottom byte so that it can use PG_waiters in a later patch. Move PG_head into bottom byte as well to match with where 'order' is moving next. PG_active and PG_workingset move into the second byte to make room for them. By putting PG_head in bit 6, we ensure that it is cleared by assigning the folio order to the bottom byte of the first tail page (since the order cannot be larger than 63). Link: https://lkml.kernel.org/r/20230816151201.3655946-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 85d54b6c9e0b..46fc05c648ff 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -99,13 +99,15 @@ */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ + PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, + PG_head, /* Must be in bit 6 */ + PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, - PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_error, PG_slab, PG_owner_priv_1, /* Owner use. 
If pagecache, fs may use*/ @@ -113,8 +115,6 @@ enum pageflags { PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ - PG_writeback, /* Page is under writeback */ - PG_head, /* A head page */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ -- cgit v1.2.3 From ebc1baf5c9b46c2240c580a2fd992b2e48606dfa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:58 +0100 Subject: mm: free up a word in the first tail page Store the folio order in the low byte of the flags word in the first tail page. This frees up the word that was being used to store the order and dtor bytes previously. Link: https://lkml.kernel.org/r/20230816151201.3655946-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- include/linux/mm_types.h | 3 +-- include/linux/page-flags.h | 7 ++++--- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 63d573781973..939386e0aeda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1000,7 +1000,7 @@ struct inode; * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be - * set before _folio_order is initialised, or this may be a tail page. + * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ static inline unsigned int compound_order(struct page *page) @@ -1009,7 +1009,7 @@ static inline unsigned int compound_order(struct page *page) if (!test_bit(PG_head, &folio->flags)) return 0; - return folio->_folio_order; + return folio->_flags_1 & 0xff; } /** @@ -1025,7 +1025,7 @@ static inline unsigned int folio_order(struct folio *folio) { if (!folio_test_large(folio)) return 0; - return folio->_folio_order; + return folio->_flags_1 & 0xff; } #include @@ -1996,7 +1996,7 @@ static inline long folio_nr_pages(struct folio *folio) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << folio->_folio_order; + return 1L << (folio->_flags_1 & 0xff); #endif } @@ -2014,7 +2014,7 @@ static inline unsigned long compound_nr(struct page *page) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << folio->_folio_order; + return 1L << (folio->_flags_1 & 0xff); #endif } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c5c2349c18e..40afb3bbc309 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -264,7 +264,6 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_folio_order: Do not use directly, call folio_order(). * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). 
@@ -316,8 +315,8 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + unsigned long _folio_avail; /* public: */ - unsigned char _folio_order; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 46fc05c648ff..638b0a96b4c5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -184,7 +184,8 @@ enum pageflags { /* * Flags only valid for compound pages. Stored in first tail page's - * flags word. + * flags word. Cannot use the first 8 flags or any flag marked as + * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ @@ -1081,8 +1082,8 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ - (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb | \ - 1UL << PG_large_rmappable) + (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ + 1UL << PG_hugetlb | 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) -- cgit v1.2.3 From 6199277baf73a4877b42991b97c40e173e530756 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:59 +0100 Subject: mm: remove folio_test_transhuge() This function is misleading; people think it means "Is this a THP", when all it actually does is check whether this is a large folio. Remove it; the one remaining user should have been checking to see whether the folio is PMD sized or not. Link: https://lkml.kernel.org/r/20230816151201.3655946-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 638b0a96b4c5..5c02720c53a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -853,11 +853,6 @@ static inline int PageTransHuge(struct page *page) return PageHead(page); } -static inline bool folio_test_transhuge(struct folio *folio) -{ - return folio_test_head(folio); -} - /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known -- cgit v1.2.3 From b10ff04dc0ec7cc7dbb0eac98c4202ec8d28c21b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:12:00 +0100 Subject: mm: add tail private fields to struct folio Because THP_SWAP uses page->private for each page, we must not use the space which overlaps that field for anything which would conflict with that. We avoid the conflict on 32-bit systems by disallowing THP_SWAP on 32-bit. 
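For illustration, here is a rough sketch of the constraint being worked around; it is not the upstream code, the function name is made up, and it assumes the usual <linux/mm.h>/<linux/swap.h> helpers. With THP_SWAP, the swap cache keeps one swap entry per subpage in page->private, so nothing else may claim that slot in the tail pages:

    static void thp_swapcache_private_sketch(struct folio *folio, swp_entry_t entry)
    {
            long i, nr = folio_nr_pages(folio);

            /* each subpage's private field holds its own swap entry value */
            for (i = 0; i < nr; i++)
                    set_page_private(folio_page(folio, i), entry.val + i);
    }

Later patches in this series stop exactly this use of tail->private, which is what eventually frees the field.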
Link: https://lkml.kernel.org/r/20230816151201.3655946-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 40afb3bbc309..06e9315bfe7c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -322,8 +322,11 @@ struct folio { atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; -#endif + /* 4 byte gap here */ /* private: the union with struct page is transitional */ + /* Fix THP_SWAP to not use tail->private */ + unsigned long _private_1; +#endif }; struct page __page_1; }; @@ -344,6 +347,9 @@ struct folio { /* public: */ struct list_head _deferred_list; /* private: the union with struct page is transitional */ + unsigned long _avail_2a; + /* Fix THP_SWAP to not use tail->private */ + unsigned long _private_2a; }; struct page __page_2; }; @@ -368,12 +374,18 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); +#ifdef CONFIG_64BIT +FOLIO_MATCH(private, _private_1); +#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(flags, _flags_2a); +FOLIO_MATCH(compound_head, _head_2a); +FOLIO_MATCH(private, _private_2a); #undef FOLIO_MATCH /** -- cgit v1.2.3 From 7a32b58be9bab8f0440a4af526bfb1269e5affdb Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:53 -0700 Subject: mm: add missing VM_FAULT_RESULT_TRACE name for VM_FAULT_COMPLETED VM_FAULT_RESULT_TRACE should contain an element for every vm_fault_reason to be used as flag_array inside trace_print_flags_seq(). The element for VM_FAULT_COMPLETED is missing, add it. Link: https://lkml.kernel.org/r/20230630211957.1341547-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Peter Xu Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 06e9315bfe7c..2b9d8be28361 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1184,7 +1184,8 @@ enum vm_fault_reason { { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ { VM_FAULT_DONE_COW, "DONE_COW" }, \ - { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \ + { VM_FAULT_COMPLETED, "COMPLETED" } struct vm_special_mapping { const char *name; /* The name, e.g. "[vdso]". 
*/ -- cgit v1.2.3 From fdc724d6aa44efd75cc9b6a3c3900baac44bc50a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:55 -0700 Subject: mm: change folio_lock_or_retry to use vm_fault directly Change folio_lock_or_retry to accept vm_fault struct and return the vm_fault_t directly. Link: https://lkml.kernel.org/r/20230630211957.1341547-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox Acked-by: Peter Xu Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f4f24b594cd7..437e4526028c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -916,8 +916,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, void __folio_lock(struct folio *folio); int __folio_lock_killable(struct folio *folio); -bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, - unsigned int flags); +vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf); void unlock_page(struct page *page); void folio_unlock(struct folio *folio); @@ -1021,11 +1020,13 @@ static inline int folio_lock_killable(struct folio *folio) * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ -static inline bool folio_lock_or_retry(struct folio *folio, - struct mm_struct *mm, unsigned int flags) +static inline vm_fault_t folio_lock_or_retry(struct folio *folio, + struct vm_fault *vmf) { might_sleep(); - return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); + if (!folio_trylock(folio)) + return __folio_lock_or_retry(folio, vmf); + return 0; } /* -- cgit v1.2.3 From 1235ccd05b6dd6970ff50baea99aa994023fbc4a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:56 -0700 Subject: mm: handle swap page faults under per-VMA lock When page fault is handled under per-VMA lock protection, all swap page faults are retried with mmap_lock because folio_lock_or_retry has to drop and reacquire mmap_lock if folio could not be immediately locked. Follow the same pattern as mmap_lock to drop per-VMA lock when waiting for folio and retrying once folio is available. With this obstacle removed, enable do_swap_page to operate under per-VMA lock protection. Drivers implementing ops->migrate_to_ram might still rely on mmap_lock, therefore we have to fall back to mmap_lock in that particular case. Note that the only time do_swap_page calls synchronous swap_readpage is when SWP_SYNCHRONOUS_IO is set, which is only set for QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and pmem). Therefore we don't sleep in this path, and there's no need to drop the mmap or per-VMA lock. 
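As a hedged sketch of the retry pattern described above (the function name is illustrative; the real logic lives in do_swap_page() and __folio_lock_or_retry(), and release_fault_lock() is the helper added in the hunk below):

    static vm_fault_t lock_folio_or_retry_sketch(struct folio *folio,
                                                 struct vm_fault *vmf)
    {
            if (folio_trylock(folio))
                    return 0;
            /* drop the per-VMA lock (or mmap_lock) before sleeping on the folio */
            release_fault_lock(vmf);
            folio_wait_locked(folio);
            return VM_FAULT_RETRY;
    }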
Link: https://lkml.kernel.org/r/20230630211957.1341547-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Alistair Popple Reviewed-by: Alistair Popple Acked-by: Peter Xu Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 939386e0aeda..0d16208178c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -729,6 +729,14 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) vma->detached = detached; } +static inline void release_fault_lock(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_end_read(vmf->vma); + else + mmap_read_unlock(vmf->vma->vm_mm); +} + struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); @@ -749,6 +757,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, return NULL; } +static inline void release_fault_lock(struct vm_fault *vmf) +{ + mmap_read_unlock(vmf->vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; -- cgit v1.2.3 From 29a22b9e08d70d6c9b075c12c47b6e895cb65cf0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:57 -0700 Subject: mm: handle userfaults under VMA lock Enable handle_userfault to operate under VMA lock by releasing VMA lock instead of mmap_lock and retrying. Note that FAULT_FLAG_RETRY_NOWAIT should never be used when handling faults under per-VMA lock protection because that would break the assumption that lock is dropped on retry. [surenb@google.com: fix a lockdep issue in vma_assert_write_locked] Link: https://lkml.kernel.org/r/20230712195652.969194-1-surenb@google.com Link: https://lkml.kernel.org/r/20230630211957.1341547-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d16208178c7..c1db400e83cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -679,6 +679,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) rcu_read_unlock(); } +/* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -721,6 +722,12 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + if (!rwsem_is_locked(&vma->vm_lock->lock)) + vma_assert_write_locked(vma); +} + static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) { /* When detaching vma should be write-locked */ @@ -737,6 +744,14 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } +static inline void assert_fault_locked(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_assert_locked(vmf->vma); + else + mmap_assert_locked(vmf->vma->vm_mm); +} + struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); @@ -762,6 +777,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } +static inline void assert_fault_locked(struct vm_fault *vmf) +{ + mmap_assert_locked(vmf->vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; -- cgit v1.2.3 From f82e6bf9bb9b6e1dc001320a88eee67d7ac31e96 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 26 Jul 2023 15:32:23 +0000 Subject: mm: memcg: use rstat for non-hierarchical stats Currently, memcg uses rstat to maintain aggregated hierarchical stats. Counters are maintained for hierarchical stats at each memcg. Rstat tracks which cgroups have updates on which cpus to keep those counters fresh on the read-side. Non-hierarchical stats are currently not covered by rstat. Their per-cpu counters are summed up on every read, which is expensive. The original implementation did the same. At some point before rstat, non-hierarchical aggregated counters were introduced by commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting"). However, those counters were updated on the performance critical write-side, which caused regressions, so they were later removed by commit 815744d75152 ("mm: memcontrol: don't batch updates of local VM stats and events"). See [1] for more detailed history. Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a half) enjoyed cheap reads of non-hierarchical stats, specifically on cgroup v1. When moving to more recent kernels, a performance regression for reading non-hierarchical stats is observed. Now that we have rstat, we know exactly which percpu counters have updates for each stat. We can maintain non-hierarchical counters again, making reads much more efficient, without affecting the performance critical write-side. Hence, add non-hierarchical (i.e local) counters for the stats, and extend rstat flushing to keep those up-to-date. A caveat is that we now need a stats flush before reading local/non-hierarchical stats through {memcg/lruvec}_page_state_local() or memcg_events_local(), where we previously only needed a flush to read hierarchical stats. Most contexts reading non-hierarchical stats are already doing a flush, add a flush to the only missing context in count_shadow_nodes(). 
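Roughly, the flush side now does something like the following simplified sketch; this is not the actual mm/memcontrol.c code, the function and parameter names are illustrative, and only the lruvec_stats field names match the header change below:

    static void lruvec_flush_one_sketch(struct lruvec_stats *stats,
                                        struct lruvec_stats *parent,
                                        int idx, long cpu_delta, long child_delta)
    {
            /* local (non-hierarchical) counter: only this cgroup's own CPUs */
            stats->state_local[idx] += cpu_delta;

            /* hierarchical counter: own CPUs plus what children propagated up */
            stats->state[idx] += cpu_delta + child_delta;
            if (parent)
                    parent->state_pending[idx] += cpu_delta + child_delta;
    }

Readers of the local value then only need a single READ_ONCE() of state_local, as in the lruvec_page_state_local() hunk below.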
With this patch, reading memory.stat from 1000 memcgs is 3x faster on a machine with 256 cpus on cgroup v1: # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done # time cat /sys/fs/cgroup/memory/cg*/memory.stat > /dev/null real 0m0.125s user 0m0.005s sys 0m0.120s After: real 0m0.032s user 0m0.005s sys 0m0.027s To make sure there are no regressions on cgroup v2, I ran an artificial reclaim/refault stress test [2] that creates (NR_CPUS * 2) cgroups, assigns them limits, runs a worker process in each cgroup that allocates tmpfs memory equal to quadruple the limit (to invoke reclaim continuously), and then reads back the entire file (to invoke refaults). All workers are run in parallel, and zram is used as a swapping backend. Both reclaim and refault have conditional stats flushing. I ran this on a machine with 112 cpus, once on mm-unstable, and once on mm-unstable with this patch reverted. (1) A few runs without this patch: # time ./stress_reclaim_refault.sh real 0m9.949s user 0m0.496s sys 14m44.974s # time ./stress_reclaim_refault.sh real 0m10.049s user 0m0.486s sys 14m55.791s # time ./stress_reclaim_refault.sh real 0m9.984s user 0m0.481s sys 14m53.841s (2) A few runs with this patch: # time ./stress_reclaim_refault.sh real 0m9.885s user 0m0.486s sys 14m48.753s # time ./stress_reclaim_refault.sh real 0m9.903s user 0m0.495s sys 14m48.339s # time ./stress_reclaim_refault.sh real 0m9.861s user 0m0.507s sys 14m49.317s No regressions are observed with this patch. There is actually a very slight improvement. If I have to guess, maybe it's because we avoid the percpu loop in count_shadow_nodes() when calling lruvec_page_state_local(), but I could not prove this using perf, it's probably in the noise. [1] https://lore.kernel.org/lkml/20230725201811.GA1231514@cmpxchg.org/ [2] https://lore.kernel.org/lkml/CAJD7tkb17x=qwoO37uxyYXLEUVp15BQKR+Xfh7Sg9Hx-wTQ_=w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230803185046.1385770-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20230726153223.821757-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 163004ae3349..11810a2cfd2d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,6 +111,9 @@ struct lruvec_stats { /* Aggregated (CPU and subtree) state */ long state[NR_VM_NODE_STAT_ITEMS]; + /* Non-hierarchical (CPU aggregated) state */ + long state_local[NR_VM_NODE_STAT_ITEMS]; + /* Pending child counts during tree propagation */ long state_pending[NR_VM_NODE_STAT_ITEMS]; }; @@ -1018,14 +1021,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, { struct mem_cgroup_per_node *pn; long x = 0; - int cpu; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for_each_possible_cpu(cpu) - x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); + x = READ_ONCE(pn->lruvec_stats.state_local[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; -- cgit v1.2.3 From f9bff0e31881d03badf191d3b0005839391f5f2b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:29 +0100 Subject: minmax: add in_range() macro Patch series "New page table range API", v6. 
This patchset changes the API used by the MM to set up page table entries. The four APIs are: set_ptes(mm, addr, ptep, pte, nr) update_mmu_cache_range(vma, addr, ptep, nr) flush_dcache_folio(folio) flush_icache_pages(vma, page, nr) flush_dcache_folio() isn't technically new, but no architecture implemented it, so I've done that for them. The old APIs remain around but are mostly implemented by calling the new interfaces. The new APIs are based around setting up N page table entries at once. The N entries belong to the same PMD, the same folio and the same VMA, so ptep++ is a legitimate operation, and locking is taken care of for you. Some architectures can do a better job of it than just a loop, but I have hesitated to make too deep a change to architectures I don't understand well. One thing I have changed in every architecture is that PG_arch_1 is now a per-folio bit instead of a per-page bit when used for dcache clean/dirty tracking. This was something that would have to happen eventually, and it makes sense to do it now rather than iterate over every page involved in a cache flush and figure out if it needs to happen. The point of all this is better performance, and Fengwei Yin has measured improvement on x86. I suspect you'll see improvement on your architecture too. Try the new will-it-scale test mentioned here: https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/ You'll need to run it on an XFS filesystem and have CONFIG_TRANSPARENT_HUGEPAGE set. This patchset is the basis for much of the anonymous large folio work being done by Ryan, so it's received quite a lot of testing over the last few months. This patch (of 38): Determine if a value lies within a range more efficiently (subtraction + comparison vs two comparisons and an AND). It also has useful (under some circumstances) behaviour if the range exceeds the maximum value of the type. Convert all the conflicting definitions of in_range() within the kernel; some can use the generic definition while others need their own definition. Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/minmax.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 396df1121bff..4f011eb6533d 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -3,6 +3,7 @@ #define _LINUX_MINMAX_H #include +#include /* * min()/max()/clamp() macros must accomplish three things: @@ -158,6 +159,32 @@ */ #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) +static inline bool in_range64(u64 val, u64 start, u64 len) +{ + return (val - start) < len; +} + +static inline bool in_range32(u32 val, u32 start, u32 len) +{ + return (val - start) < len; +} + +/** + * in_range - Determine if a value lies within a range. + * @val: Value to test. + * @start: First value in range. + * @len: Number of values in range. + * + * This is more efficient than "if (start <= val && val < (start + len))". + * It also gives a different answer if @start + @len overflows the size of + * the type by a sufficient amount to encompass @val. Decide for yourself + * which behaviour you want, or prove that start + len never overflow. + * Do not blindly replace one form with the other. 
+ */ +#define in_range(val, start, len) \ + ((sizeof(start) | sizeof(len) | sizeof(val)) <= sizeof(u32) ? \ + in_range32(val, start, len) : in_range64(val, start, len)) + /** * swap - swap values of @a and @b * @a: first value -- cgit v1.2.3 From a379322022c0961fe0b638cdd842d3c38eeff92c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:30 +0100 Subject: mm: convert page_table_check_pte_set() to page_table_check_ptes_set() Tell the page table check how many PTEs & PFNs we want it to check. Link: https://lkml.kernel.org/r/20230802151406.3735276-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/page_table_check.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 7f6b9bf926c5..6722941c7cb8 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,7 +17,8 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); +void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, + unsigned int nr); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -64,13 +65,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) __page_table_check_pud_clear(mm, pud); } -static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void page_table_check_ptes_set(struct mm_struct *mm, + pte_t *ptep, pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_set(mm, ptep, pte); + __page_table_check_ptes_set(mm, ptep, pte, nr); } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, @@ -123,8 +124,8 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } -static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void page_table_check_ptes_set(struct mm_struct *mm, + pte_t *ptep, pte_t pte, unsigned int nr) { } -- cgit v1.2.3 From bc60abfbe687e886f1c38d49ef2a00e90b4b49cf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:32 +0100 Subject: mm: add folio_flush_mapping() This is the folio equivalent of page_mapping_file(), but rename it to make it clear that it's very different from page_file_mapping(). Theoretically, there's nothing flush-only about it, but there are no other users today, and I doubt there will be; it's almost always more useful to know the swapfile's mapping or the swapcache's mapping. 
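To make the intended use concrete, here is a hedged sketch of how an architecture's deferred D-cache flush might use it; this is modelled on common arch patterns rather than any specific port, and __flush_dcache_folio() is a hypothetical arch helper:

    void flush_dcache_folio(struct folio *folio)
    {
            struct address_space *mapping = folio_flush_mapping(folio);

            if (mapping && !mapping_mapped(mapping)) {
                    /* no user mappings yet: defer the flush via an arch flag */
                    set_bit(PG_arch_1, &folio->flags);
                    return;
            }

            __flush_dcache_folio(folio);    /* hypothetical arch helper */
    }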
Link: https://lkml.kernel.org/r/20230802151406.3735276-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 437e4526028c..88d161887cf2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -389,6 +389,26 @@ static inline struct address_space *folio_file_mapping(struct folio *folio) return folio->mapping; } +/** + * folio_flush_mapping - Find the file mapping this folio belongs to. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. Anonymous folios return NULL, even if they're in + * the swap cache. Other kinds of folio also return NULL. + * + * This is ONLY used by architecture cache flushing code. If you aren't + * writing cache flushing code, you want either folio_mapping() or + * folio_file_mapping(). + */ +static inline struct address_space *folio_flush_mapping(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return NULL; + + return folio_mapping(folio); +} + static inline struct address_space *page_file_mapping(struct page *page) { return folio_file_mapping(page_folio(page)); @@ -399,11 +419,7 @@ static inline struct address_space *page_file_mapping(struct page *page) */ static inline struct address_space *page_mapping_file(struct page *page) { - struct folio *folio = page_folio(page); - - if (unlikely(folio_test_swapcache(folio))) - return NULL; - return folio_mapping(folio); + return folio_flush_mapping(page_folio(page)); } /** -- cgit v1.2.3 From 29d26f1215de14721188988a59b1426abb85b7be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:33 +0100 Subject: mm: remove ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO Current best practice is to reuse the name of the function as a define to indicate that the function is implemented by the architecture. Link: https://lkml.kernel.org/r/20230802151406.3735276-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/cacheflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h index a6189d21f2ba..82136f3fcf54 100644 --- a/include/linux/cacheflush.h +++ b/include/linux/cacheflush.h @@ -7,14 +7,14 @@ struct folio; #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +#ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio); #endif #else static inline void flush_dcache_folio(struct folio *folio) { } -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO 0 +#define flush_dcache_folio flush_dcache_folio #endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ #endif /* _LINUX_CACHEFLUSH_H */ -- cgit v1.2.3 From bcc6cc832573a99d1f935c89a28e2c71fd1aaf0c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:34 +0100 Subject: mm: add default definition of set_ptes() Most architectures can just define set_pte() and PFN_PTE_SHIFT to use this definition. It's also a handy spot to document the guarantees provided by the MM. 
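As a hedged example of the per-architecture opt-in described above (the values are illustrative, not taken from any particular architecture):

    /* in the architecture's pgtable.h */
    #define PFN_PTE_SHIFT   PAGE_SHIFT      /* bit position of the pfn within a pte */

    static inline void set_pte(pte_t *ptep, pte_t pte)
    {
            *ptep = pte;
    }

    /* the generic set_ptes()/set_pte_at() from <linux/pgtable.h> then just work */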
Link: https://lkml.kernel.org/r/20230802151406.3735276-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Suggested-by: Mike Rapoport (IBM) Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 81 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 6064f454c8e3..81c3f7decb1c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -182,6 +182,66 @@ static inline int pmd_young(pmd_t pmd) } #endif +/* + * A facility to provide lazy MMU batching. This allows PTE updates and + * page invalidations to be delayed until a call to leave lazy MMU mode + * is issued. Some architectures may benefit from doing this, and it is + * beneficial for both shadow and direct mode hypervisors, which may batch + * the PTE updates which happen during this window. Note that using this + * interface requires that read hazards be removed from the code. A read + * hazard could result in the direct mode hypervisor case, since the actual + * write to the page tables may not yet have taken place, so reads though + * a raw PTE pointer after it has been modified are not guaranteed to be + * up to date. This mode can only be entered and left under the protection of + * the page table locks for all page tables which may be modified. In the UP + * case, this is required so that preemption is disabled, and in the SMP case, + * it must synchronize the delayed page table writes properly on other CPUs. + */ +#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#define arch_enter_lazy_mmu_mode() do {} while (0) +#define arch_leave_lazy_mmu_mode() do {} while (0) +#define arch_flush_lazy_mmu_mode() do {} while (0) +#endif + +#ifndef set_ptes +#ifdef PFN_PTE_SHIFT +/** + * set_ptes - Map consecutive pages to a contiguous range of addresses. + * @mm: Address space to map the pages into. + * @addr: Address to map the first page at. + * @ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @nr: Number of pages to map. + * + * May be overridden by the architecture, or the architecture can define + * set_pte() and PFN_PTE_SHIFT. + * + * Context: The caller holds the page table lock. The pages all belong + * to the same folio. The PTEs are all in the same PMD. + */ +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + page_table_check_ptes_set(mm, ptep, pte, nr); + + arch_enter_lazy_mmu_mode(); + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + } + arch_leave_lazy_mmu_mode(); +} +#ifndef set_pte_at +#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) +#endif +#endif +#else +#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) +#endif + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, @@ -1051,27 +1111,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define pgprot_decrypted(prot) (prot) #endif -/* - * A facility to provide lazy MMU batching. This allows PTE updates and - * page invalidations to be delayed until a call to leave lazy MMU mode - * is issued. 
Some architectures may benefit from doing this, and it is - * beneficial for both shadow and direct mode hypervisors, which may batch - * the PTE updates which happen during this window. Note that using this - * interface requires that read hazards be removed from the code. A read - * hazard could result in the direct mode hypervisor case, since the actual - * write to the page tables may not yet have taken place, so reads though - * a raw PTE pointer after it has been modified are not guaranteed to be - * up to date. This mode can only be entered and left under the protection of - * the page table locks for all page tables which may be modified. In the UP - * case, this is required so that preemption is disabled, and in the SMP case, - * it must synchronize the delayed page table writes properly on other CPUs. - */ -#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) -#endif - /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for -- cgit v1.2.3 From 29269ad90bed55a9ece007a7ece53f4505df3781 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:58 +0100 Subject: mm: remove page_mapping_file() This function has no more users. Link: https://lkml.kernel.org/r/20230802151406.3735276-31-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 88d161887cf2..b5c4c8beefe2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -414,14 +414,6 @@ static inline struct address_space *page_file_mapping(struct page *page) return folio_file_mapping(page_folio(page)); } -/* - * For file cache pages, return the address_space, otherwise return NULL - */ -static inline struct address_space *page_mapping_file(struct page *page) -{ - return folio_flush_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. -- cgit v1.2.3 From 203b7b6aad6769a43987deb81c35456de8bb16c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:59 +0100 Subject: mm: rationalise flush_icache_pages() and flush_icache_page() Move the default (no-op) implementation of flush_icache_pages() to <linux/cacheflush.h>. Remove the flush_icache_page() wrapper from each architecture into <linux/cacheflush.h>. 
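On the architecture side, providing the ranged hook looks roughly like this hedged sketch, where my_arch_sync_icache() is a made-up stand-in for the real cache maintenance routine:

    #define flush_icache_pages flush_icache_pages
    static inline void flush_icache_pages(struct vm_area_struct *vma,
                                          struct page *page, unsigned int nr)
    {
            my_arch_sync_icache(page_address(page), nr * PAGE_SIZE);
    }

    /* flush_icache_page(vma, page) now comes from <linux/cacheflush.h> for free */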
Link: https://lkml.kernel.org/r/20230802151406.3735276-32-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/cacheflush.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h index 82136f3fcf54..55f297b2c23f 100644 --- a/include/linux/cacheflush.h +++ b/include/linux/cacheflush.h @@ -17,4 +17,13 @@ static inline void flush_dcache_folio(struct folio *folio) #define flush_dcache_folio flush_dcache_folio #endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ +#ifndef flush_icache_pages +static inline void flush_icache_pages(struct vm_area_struct *vma, + struct page *page, unsigned int nr) +{ +} +#endif + +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) + #endif /* _LINUX_CACHEFLUSH_H */ -- cgit v1.2.3 From af4fcb0729329cc4b3f6977d7f75562a00174bd1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:14:00 +0100 Subject: mm: tidy up set_ptes definition Now that all architectures are converted, we can remove the PFN_PTE_SHIFT ifdef and we can define set_pte_at() unconditionally. Link: https://lkml.kernel.org/r/20230802151406.3735276-33-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 81c3f7decb1c..fc811c9b421a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -204,7 +204,6 @@ static inline int pmd_young(pmd_t pmd) #endif #ifndef set_ptes -#ifdef PFN_PTE_SHIFT /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. @@ -234,13 +233,8 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, } arch_leave_lazy_mmu_mode(); } -#ifndef set_pte_at -#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) -#endif #endif -#else #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) -#endif #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, -- cgit v1.2.3 From 86f35f69db8e7d169c36472a349507ab0a461f49 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:03 +0100 Subject: rmap: add folio_add_file_rmap_range() folio_add_file_rmap_range() allows to add pte mapping to a specific range of file folio. Comparing to page_add_file_rmap(), it batched updates __lruvec_stat for large folio. 
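A hedged before/after sketch of a fault-around style caller (locals such as folio, vma, start and nr are assumed; the signatures are the ones declared in the hunk below):

    /* before: one call, and one set of counter updates, per page */
    for (i = 0; i < nr; i++)
            page_add_file_rmap(folio_page(folio, start + i), vma, false);

    /* after: a single batched call for the whole sub-range of the folio */
    folio_add_file_rmap_range(folio, folio_page(folio, start), nr, vma, false);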
Link: https://lkml.kernel.org/r/20230802151406.3735276-36-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b87d01660412..a3825ce81102 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, + struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); -- cgit v1.2.3 From 3bd786f76de2e01745f462844fd1a206052ee8b8 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:04 +0100 Subject: mm: convert do_set_pte() to set_pte_range() set_pte_range() allows to setup page table entries for a specific range. It takes advantage of batched rmap update for large folio. It now takes care of calling update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-37-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c1db400e83cb..ddb95967ba64 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1322,7 +1322,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); +void set_pte_range(struct vm_fault *vmf, struct folio *folio, + struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); -- cgit v1.2.3 From cfeed8ffe55b37fa10286aaaa1369da00cb88440 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:46 +0200 Subject: mm/swap: stop using page->private on tail pages for THP_SWAP Patch series "mm/swap: stop using page->private on tail pages for THP_SWAP + cleanups". This series stops using page->private on tail pages for THP_SWAP, replaces folio->private by folio->swap for swapcache folios, and starts using "new_folio" for tail pages that we are splitting to remove the usage of page->private for swapcache handling completely. This patch (of 4): Let's stop using page->private on tail pages, making it possible to just unconditionally reuse that field in the tail pages of large folios. The remaining usage of the private field for THP_SWAP is in the THP splitting code (mm/huge_memory.c), that we'll handle separately later. Update the THP_SWAP documentation and sanity checks in mm_types.h and __split_huge_page_tail(). 
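A hedged sketch of what this means for a reader of the swap entry; the "before" line is the old per-subpage convention, the "after" line uses the page_swap_entry() helper this patch adds to swap.h, and the two are alternatives rather than code to run together:

    /* before: each tail page carried its own copy in page->private */
    swp_entry_t entry = { .val = page_private(page) };

    /* after: derive it from the folio's entry plus the page's index in the folio */
    swp_entry_t entry = page_swap_entry(page);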
[david@redhat.com: stop using page->private on tail pages for THP_SWAP] Link: https://lkml.kernel.org/r/6f0a82a3-6948-20d9-580b-be1dbf415701@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-1-david@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas [arm64] Reviewed-by: Yosry Ahmed Cc: Dan Streetman Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 12 +----------- include/linux/swap.h | 9 +++++++++ 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2b9d8be28361..55cd4bc57b8d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -322,11 +322,8 @@ struct folio { atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; - /* 4 byte gap here */ - /* private: the union with struct page is transitional */ - /* Fix THP_SWAP to not use tail->private */ - unsigned long _private_1; #endif + /* private: the union with struct page is transitional */ }; struct page __page_1; }; @@ -347,9 +344,6 @@ struct folio { /* public: */ struct list_head _deferred_list; /* private: the union with struct page is transitional */ - unsigned long _avail_2a; - /* Fix THP_SWAP to not use tail->private */ - unsigned long _private_2a; }; struct page __page_2; }; @@ -374,9 +368,6 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); -#ifdef CONFIG_64BIT -FOLIO_MATCH(private, _private_1); -#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ @@ -385,7 +376,6 @@ FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); FOLIO_MATCH(flags, _flags_2a); FOLIO_MATCH(compound_head, _head_2a); -FOLIO_MATCH(private, _private_2a); #undef FOLIO_MATCH /** diff --git a/include/linux/swap.h b/include/linux/swap.h index bb5adc604144..e5cf58a1cf9e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -339,6 +339,15 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline swp_entry_t page_swap_entry(struct page *page) +{ + struct folio *folio = page_folio(page); + swp_entry_t entry = folio_swap_entry(folio); + + entry.val += folio_page_idx(folio, page); + return entry; +} + static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) { folio->private = (void *)entry.val; -- cgit v1.2.3 From 85a1333417a7561c1d10a77d6c873a37e6ea63a0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 21 Aug 2023 18:08:47 +0200 Subject: mm/swap: use dedicated entry for swap in folio Let's stop working on the private field and use an explicit swap field. We have to move the swp_entry_t typedef. 
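The ordering point is easy to see in a tiny stand-alone sketch; the struct below is an illustrative stand-in, not the real struct folio:

    /* the typedef must now appear before the struct that embeds it */
    typedef struct {
            unsigned long val;
    } swp_entry_t;

    struct folio_like {
            union {
                    void *private;          /* generic per-folio data */
                    swp_entry_t swap;       /* used while in the swap cache */
            };
    };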
Link: https://lkml.kernel.org/r/20230821160849.531668-3-david@redhat.com Signed-off-by: Matthew Wilcox Signed-off-by: David Hildenbrand Reviewed-by: Chris Li Cc: Catalin Marinas Cc: Dan Streetman Cc: Hugh Dickins Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 23 +++++++++++++---------- include/linux/swap.h | 5 ++--- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 55cd4bc57b8d..36c5b43999e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -248,6 +248,14 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); } +/* + * A swap entry has to fit into a "unsigned long", as the entry is hidden + * in the "index" field of the swapper address space. + */ +typedef struct { + unsigned long val; +} swp_entry_t; + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -258,7 +266,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. * @private: Filesystem per-folio data (see folio_attach_private()). - * Used for swp_entry_t if folio_test_swapcache(). + * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. Use folio_mapcount() to * find out how many times this folio is mapped by userspace. * @_refcount: Do not access this member directly. Use folio_ref_count() @@ -301,7 +309,10 @@ struct folio { }; struct address_space *mapping; pgoff_t index; - void *private; + union { + void *private; + swp_entry_t swap; + }; atomic_t _mapcount; atomic_t _refcount; #ifdef CONFIG_MEMCG @@ -1209,14 +1220,6 @@ enum tlb_flush_reason { NR_TLB_FLUSH_REASONS, }; - /* - * A swap entry has to fit into a "unsigned long", as the entry is hidden - * in the "index" field of the swapper address space. - */ -typedef struct { - unsigned long val; -} swp_entry_t; - /** * enum fault_flag - Fault flag definitions. * @FAULT_FLAG_WRITE: Fault was a write fault. diff --git a/include/linux/swap.h b/include/linux/swap.h index e5cf58a1cf9e..352eca0a75bc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,8 +335,7 @@ struct swap_info_struct { static inline swp_entry_t folio_swap_entry(struct folio *folio) { - swp_entry_t entry = { .val = page_private(&folio->page) }; - return entry; + return folio->swap; } static inline swp_entry_t page_swap_entry(struct page *page) @@ -350,7 +349,7 @@ static inline swp_entry_t page_swap_entry(struct page *page) static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) { - folio->private = (void *)entry.val; + folio->swap = entry; } /* linux/mm/workingset.c */ -- cgit v1.2.3 From 3d2c908768877714a354ee6d7bf93e801400d5e2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:48 +0200 Subject: mm/swap: inline folio_set_swap_entry() and folio_swap_entry() Let's simply work on the folio directly and remove the helpers. 
Link: https://lkml.kernel.org/r/20230821160849.531668-4-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Matthew Wilcox Reviewed-by: Chris Li Cc: Catalin Marinas Cc: Dan Streetman Cc: Hugh Dickins Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/swap.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 352eca0a75bc..493487ed7c38 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -333,25 +333,15 @@ struct swap_info_struct { */ }; -static inline swp_entry_t folio_swap_entry(struct folio *folio) -{ - return folio->swap; -} - static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); - swp_entry_t entry = folio_swap_entry(folio); + swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } -static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) -{ - folio->swap = entry; -} - /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); -- cgit v1.2.3 From bb7dbaafff3f582d18028a5b99a8faa789842678 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 19 Aug 2023 04:18:37 +0100 Subject: mm: remove checks for pte_index Since pte_index is always defined, we don't need to check whether it's defined or not. Delete the slow version that doesn't depend on it and remove the #define since nobody needs to test for it. Link: https://lkml.kernel.org/r/20230819031837.3160096-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Cc: Christian Dietrich Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index fc811c9b421a..95ad544ad395 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -63,7 +63,6 @@ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } -#define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) -- cgit v1.2.3 From 051ddcfeb1bdbae45e660c0db2468d29ca15c6c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:23:33 +0100 Subject: mm: move PMD_ORDER to pgtable.h Patch series "Change calling convention for ->huge_fault", v2. There are two unrelated changes to the calling convention for ->huge_fault. I've bundled them together to help people notice the change. The first is to improve scalability of DAX page faults by allowing them to be handled under the VMA lock. The second is to remove enum page_entry_size since it's really unnecessary. The changelogs and documentation updates hopefully work to that end. This patch (of 3): Allow this to be used in generic code. Also add PUD_ORDER. 
Link: https://lkml.kernel.org/r/20230818202335.2739663-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230818202335.2739663-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 95ad544ad395..f49abcfe5eda 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -5,6 +5,9 @@ #include #include +#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) +#define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) + #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU -- cgit v1.2.3 From 1d024e7a8dabcc3c84d77532a88c774c32cf8245 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:23:35 +0100 Subject: mm: remove enum page_entry_size Remove the unnecessary encoding of page order into an enum and pass the page order directly. That lets us get rid of pe_order(). The switch constructs have to be changed to if/else constructs to prevent GCC from warning on builds with 3-level page tables where PMD_ORDER and PUD_ORDER have the same value. If you are looking at this commit because your driver stopped compiling, look at the previous commit as well and audit your driver to be sure it doesn't depend on mmap_lock being held in its ->huge_fault method. [willy@infradead.org: use "order %u" to match the (non dev_t) style] Link: https://lkml.kernel.org/r/ZOUYekbtTv+n8hYf@casper.infradead.org Link: https://lkml.kernel.org/r/20230818202335.2739663-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/dax.h | 4 ++-- include/linux/mm.h | 10 +--------- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 261944ec0887..22cd9902345d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -241,10 +241,10 @@ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, pfn_t pfn); + unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/mm.h b/include/linux/mm.h index ddb95967ba64..53efddc4d178 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,13 +532,6 @@ struct vm_fault { */ }; -/* page entry size for vm->huge_fault() */ -enum page_entry_size { - PE_SIZE_PTE = 0, - PE_SIZE_PMD, - PE_SIZE_PUD, -}; - /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -562,8 +555,7 @@ struct vm_operations_struct { int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); - vm_fault_t (*huge_fault)(struct vm_fault *vmf, - enum page_entry_size pe_size); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct 
vm_area_struct * area); -- cgit v1.2.3 From 8f9ff2deb8b91ad1cba666c7adbe4ca79e2d3225 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Aug 2023 21:23:35 +0100 Subject: secretmem: convert page_is_secretmem() to folio_is_secretmem() The only caller already has a folio, so use it to save calling compound_head() in PageLRU() and remove a use of page->mapping. Link: https://lkml.kernel.org/r/20230822202335.179081-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/secretmem.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index 988528b5da43..35f3a4a8ceb1 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -6,24 +6,23 @@ extern const struct address_space_operations secretmem_aops; -static inline bool page_is_secretmem(struct page *page) +static inline bool folio_is_secretmem(struct folio *folio) { struct address_space *mapping; /* - * Using page_mapping() is quite slow because of the actual call - * instruction and repeated compound_head(page) inside the - * page_mapping() function. + * Using folio_mapping() is quite slow because of the actual call + * instruction. * We know that secretmem pages are not compound and LRU so we can * save a couple of cycles here. */ - if (PageCompound(page) || !PageLRU(page)) + if (folio_test_large(folio) || !folio_test_lru(folio)) return false; mapping = (struct address_space *) - ((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); + ((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS); - if (!mapping || mapping != page->mapping) + if (!mapping || mapping != folio->mapping) return false; return mapping->a_ops == &secretmem_aops; @@ -39,7 +38,7 @@ static inline bool vma_is_secretmem(struct vm_area_struct *vma) return false; } -static inline bool page_is_secretmem(struct page *page) +static inline bool folio_is_secretmem(struct folio *folio) { return false; } -- cgit v1.2.3 From 52ae298e3e5c9be5bb95e1c6d9199e5210f2a156 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 22 Aug 2023 00:51:45 +0200 Subject: maple_tree: shrink struct maple_tree Pack the members of struct maple_tree to avoid holes on 64-bit. The size shrinks from 24 to 16 bytes which will save eight bytes in every structure which embeds it. [willy@infradead.org: changelog alterations] Link: https://lkml.kernel.org/r/20230821225145.2169848-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Reviewed-by: Liam R. Howlett Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index c962af188681..e41c70ac7744 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -220,8 +220,8 @@ struct maple_tree { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; - void __rcu *ma_root; unsigned int ma_flags; + void __rcu *ma_root; }; /** -- cgit v1.2.3
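To make the space saving in the last patch concrete, here is a hedged sketch of the layout on a typical 64-bit build; the struct names are invented for illustration, the lockdep union is elided, and spinlock_t is assumed to be 4 bytes (no spinlock debugging):

    struct maple_tree_before {              /* illustrative only */
            spinlock_t      ma_lock;        /* 4 bytes, then 4 bytes of padding */
            void __rcu      *ma_root;       /* 8 bytes */
            unsigned int    ma_flags;       /* 4 bytes, then 4 bytes tail padding */
    };                                      /* sizeof() == 24 */

    struct maple_tree_after {               /* illustrative only */
            spinlock_t      ma_lock;        /* 4 bytes */
            unsigned int    ma_flags;       /* 4 bytes, fills the former hole */
            void __rcu      *ma_root;       /* 8 bytes */
    };                                      /* sizeof() == 16 */

A tool such as pahole can be used to confirm that the reordered layout has no holes.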