From a19621ed4e0ac9e1d296d07dc8ff8c27d5c03b43 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 15 May 2024 15:07:06 +0800 Subject: mm: add folio_alloc_mpol() Patch series "mm: convert to folio_alloc_mpol()". This patch (of 4): This adds a new folio_alloc_mpol() like folio_alloc() but allocate folio according to NUMA mempolicy. Link: https://lkml.kernel.org/r/20240515070709.78529-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240515070709.78529-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7f9691d375f0..f53f76e0b17e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -303,6 +303,8 @@ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); +struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -319,6 +321,11 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } +static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return folio_alloc_noprof(gfp, order); +} #define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ folio_alloc_noprof(gfp, order) #endif @@ -326,6 +333,7 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) +#define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -- cgit v1.2.3 From 7f83bf14603ef41a44dc907594d749a283e22c37 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Wed, 15 May 2024 10:47:54 +0800 Subject: mm/huge_memory: mark racy access onhuge_anon_orders_always huge_anon_orders_always is accessed lockless, it is better to use the READ_ONCE() wrapper. This is not fixing any visible bug, hopefully this can cease some KCSAN complains in the future. Also do that for huge_anon_orders_madvise. Link: https://lkml.kernel.org/r/20240515104754889HqrahFPePOIE1UlANHVAh@zte.com.cn Signed-off-by: Ran Xiaokai Acked-by: David Hildenbrand Reviewed-by: Lu Zhongjun Reviewed-by: xu xin Cc: Yang Yang Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2aa986a5cd1b..088d66a54643 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -134,8 +134,8 @@ static inline bool hugepage_flags_enabled(void) * So we don't need to look at huge_anon_orders_inherit. 
*/ return hugepage_global_enabled() || - huge_anon_orders_always || - huge_anon_orders_madvise; + READ_ONCE(huge_anon_orders_always) || + READ_ONCE(huge_anon_orders_madvise); } static inline int highest_order(unsigned long orders) -- cgit v1.2.3 From 564a2ee9f9f66ceef135b7208cff705d48ad76c4 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:51 +0800 Subject: mm: remove page_file_offset and folio_file_pos These two helpers were useful for mixed usage of swap cache and page cache, which help retrieve the corresponding file or swap device offset of a page or folio. They were introduced in commit f981c5950fa8 ("mm: methods for teaching filesystems about PG_swapcache pages") and used in commit d56b4ddf7781 ("nfs: teach the NFS client how to treat PG_swapcache pages"), suppose to be used with direct_IO for swap over fs. But after commit e1209d3a7a67 ("mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space"), swap with direct_IO is no more, and swap cache mapping is never exposed to fs. Now we have dropped all users of page_file_offset and folio_file_pos, so they can be deleted. Link: https://lkml.kernel.org/r/20240521175854.96038-10-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 59f1df0cde5a..78f2cd8c73ff 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -932,11 +932,6 @@ static inline loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_SHIFT; } -static inline loff_t page_file_offset(struct page *page) -{ - return ((loff_t)page_index(page)) << PAGE_SHIFT; -} - /** * folio_pos - Returns the byte position of this folio in its file. * @folio: The folio. @@ -946,18 +941,6 @@ static inline loff_t folio_pos(struct folio *folio) return page_offset(&folio->page); } -/** - * folio_file_pos - Returns the byte position of this folio in its file. - * @folio: The folio. - * - * This differs from folio_pos() for folios which belong to a swap file. - * NFS is the only filesystem today which needs to use folio_file_pos(). - */ -static inline loff_t folio_file_pos(struct folio *folio) -{ - return page_file_offset(&folio->page); -} - /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ -- cgit v1.2.3 From 05b0c7edad9b8a5ccf1b46b01e1b96fcd10b50d8 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:52 +0800 Subject: mm: drop page_index and simplify folio_index There are two helpers for retrieving the index within address space for mixed usage of swap cache and page cache: - page_index - folio_index This commit drops page_index, as we have eliminated all users, and converts folio_index's helper __page_file_index to use folio to avoid the page conversion. 
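For remaining out-of-tree callers the conversion is mechanical. A minimal sketch, assuming a hypothetical caller that still holds a struct page (folio_index() and page_folio() are the existing helpers this series keeps):

	/* Before: page-based lookup covering both page cache and swap cache */
	pgoff_t idx = page_index(page);

	/*
	 * After: operate on the folio; folio_index() handles the swap cache
	 * case via __folio_swap_cache_index(), as introduced below.
	 */
	pgoff_t idx = folio_index(page_folio(page));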
Link: https://lkml.kernel.org/r/20240521175854.96038-11-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 ------------- include/linux/pagemap.h | 8 ++++---- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb7c96d24ac0..99174fac3d47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2295,19 +2295,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern pgoff_t __page_file_index(struct page *page); - -/* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use swp_offset(->private) - */ -static inline pgoff_t page_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); - return page->index; -} - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 78f2cd8c73ff..1586b5c21e81 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -792,7 +792,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -#define swapcache_index(folio) __page_file_index(&(folio)->page) +extern pgoff_t __folio_swap_cache_index(struct folio *folio); /** * folio_index - File index of a folio. @@ -807,9 +807,9 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, */ static inline pgoff_t folio_index(struct folio *folio) { - if (unlikely(folio_test_swapcache(folio))) - return swapcache_index(folio); - return folio->index; + if (unlikely(folio_test_swapcache(folio))) + return __folio_swap_cache_index(folio); + return folio->index; } /** -- cgit v1.2.3 From 63818aaf0da8c036d600424ac961620dcdc13ce2 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 20 May 2024 15:44:07 -0700 Subject: mm/hugetlb: remove {Set,Clear}Hpage macros All users have been converted to use the folio version of these macros, we can safely remove the page based interface. 
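As a hedged illustration of what the completed conversion looks like at call sites (the migratable flag is only an example and the caller is hypothetical):

	/* Old page-based interface, removed by this patch */
	SetHPageMigratable(page);
	ClearHPageMigratable(page);

	/* Folio-based equivalents generated by the same macros */
	folio_set_hugetlb_migratable(folio);
	folio_clear_hugetlb_migratable(folio);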
Link: https://lkml.kernel.org/r/20240520224407.110062-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2b3c3a404769..15a58f69782c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -625,18 +625,14 @@ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ - } \ -static inline void SetHPage##uname(struct page *page) \ - { set_bit(HPG_##flname, &(page->private)); } + } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ clear_bit(HPG_##flname, private); \ - } \ -static inline void ClearHPage##uname(struct page *page) \ - { clear_bit(HPG_##flname, &(page->private)); } + } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ @@ -648,15 +644,11 @@ static inline int HPage##uname(struct page *page) \ #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ - { } \ -static inline void SetHPage##uname(struct page *page) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ - { } \ -static inline void ClearHPage##uname(struct page *page) \ { } #endif -- cgit v1.2.3 From 6ad28e7e52e2bae76b1d3eb4466999d04d50db92 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 May 2024 14:57:13 +0200 Subject: mm/rmap: sanity check that zeropages are not passed to RMAP Using insert_page() we might have previously ended up passing the zeropage into rmap code. Make sure that won't happen again. Note that we won't check the huge zeropage for now, which might still end up in RMAP code. Link: https://lkml.kernel.org/r/20240522125713.775114-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Dan Williams Cc: Vincent Donnefort Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7229b9baf20d..5cb0d419a1d7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -200,6 +200,9 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio, /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + /* When (un)mapping zeropages, we should never touch ref+mapcount. */ + VM_WARN_ON_FOLIO(is_zero_folio(folio), folio); + /* * TODO: we get driver-allocated folios that have nothing to do with * the rmap using vm_insert_page(); therefore, we cannot assume that -- cgit v1.2.3 From 23b1b44e6c61295084284aa7d87db863a7802b92 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:02 +0800 Subject: mm: add update_mmu_tlb_range() Patch series "Add update_mmu_tlb_range() to simplify code", v4. This series of commits mainly adds the update_mmu_tlb_range() to batch update tlb in an address range and implement update_mmu_tlb() using update_mmu_tlb_range(). After commit 19eaf44954df ("mm: thp: support allocation of anonymous multi-size THP"), We may need to batch update tlb of a certain address range by calling update_mmu_tlb() in a loop. 
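A minimal sketch of the pattern this series targets, assuming a hypothetical caller that has just mapped nr PTEs (the update_mmu_tlb_range() signature matches the hunk added below):

	/* Before: one TLB update per PTE */
	for (i = 0; i < nr; i++)
		update_mmu_tlb(vma, addr + i * PAGE_SIZE, ptep + i);

	/* After: a single batched call for the whole range */
	update_mmu_tlb_range(vma, addr, ptep, nr);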
Using the update_mmu_tlb_range(), we can simplify the code and possibly reduce the execution of some unnecessary code in some architectures. This patch (of 3): Add update_mmu_tlb_range(), we can batch update tlb of an address range. Link: https://lkml.kernel.org/r/20240522061204.117421-1-libang.li@antgroup.com Link: https://lkml.kernel.org/r/20240522061204.117421-2-libang.li@antgroup.com Signed-off-by: Bang Li Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Lance Yang Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 18019f037bae..17d1caee39ab 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -729,6 +729,13 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ +#ifndef update_mmu_tlb_range +static inline void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) +{ +} +#endif + #ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) -- cgit v1.2.3 From 8f65aa32239f1c3f11b7a25bd5921223bafc5fed Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:03 +0800 Subject: mm: implement update_mmu_tlb() using update_mmu_tlb_range() Let's make update_mmu_tlb() simply a generic wrapper around update_mmu_tlb_range(). Only the latter can now be overridden by the architecture. We can now remove __HAVE_ARCH_UPDATE_MMU_TLB as well. Link: https://lkml.kernel.org/r/20240522061204.117421-3-libang.li@antgroup.com Signed-off-by: Bang Li Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Lance Yang Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 17d1caee39ab..117b807e3f89 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -736,13 +736,11 @@ static inline void update_mmu_tlb_range(struct vm_area_struct *vma, } #endif -#ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + update_mmu_tlb_range(vma, address, ptep, 1); } -#define __HAVE_ARCH_UPDATE_MMU_TLB -#endif /* * Some architectures may be able to avoid expensive synchronization -- cgit v1.2.3 From b8b9488d50b7150bd4830dfff487e8d4ef6589ba Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 24 May 2024 15:53:04 -0600 Subject: mm/memory-failure: improve memory failure action_result messages Added two explicit MF_MSG messages describing failure in get_hwpoison_page. Attemped to document the definition of various action names, and made a few adjustment to the action_result() calls. 
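As a hedged sketch of how the two new entries might be reported from mm/memory-failure.c (action_result()'s exact signature is an assumption here, since this hunk only extends the enum):

	/* e.g. when get_hwpoison_page() cannot grab a reference on the page */
	res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);

	/* e.g. when the page turns out to have been poisoned already */
	res = action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);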
Link: https://lkml.kernel.org/r/20240524215306.2705454-4-jane.chu@oracle.com Signed-off-by: Jane Chu Reviewed-by: Oscar Salvador Acked-by: Miaohe Lin Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 99174fac3d47..f4bf94ca99e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4096,6 +4096,7 @@ enum mf_action_page_type { MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, + MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, @@ -4109,6 +4110,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, + MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; -- cgit v1.2.3 From 188f87f2648b13f5de17d5e068f18d317e0c1f98 Mon Sep 17 00:00:00 2001 From: Eric Chanudet Date: Wed, 22 May 2024 16:38:01 -0400 Subject: mm/mm_init: use node's number of cpus in deferred_page_init_max_threads x86_64 is already using the node's cpu as maximum threads. Make that the default for all archs setting DEFERRED_STRUCT_PAGE_INIT. This returns to the behavior prior making the function arch-specific with commit ecd096506922 ("mm: make deferred init's max threads arch-specific"). Setting DEFERRED_STRUCT_PAGE_INIT and testing on a few arm64 platforms shows faster deferred_init_memmap completions: | | x13s | SA8775p-ride | Ampere R137-P31 | Ampere HR330 | | | Metal, 32GB | VM, 36GB | VM, 58GB | Metal, 128GB | | | 8cpus | 8cpus | 8cpus | 32cpus | |---------|-------------|--------------|-----------------|--------------| | threads | ms (%) | ms (%) | ms (%) | ms (%) | |---------|-------------|--------------|-----------------|--------------| | 1 | 108 (0%) | 72 (0%) | 224 (0%) | 324 (0%) | | cpus | 24 (-77%) | 36 (-50%) | 40 (-82%) | 56 (-82%) | Michael Ellerman reported: : On a machine here (1TB, 40 cores, 4KB pages) the existing code gives: : : [ 0.500124] node 2 deferred pages initialised in 210ms : [ 0.515790] node 3 deferred pages initialised in 230ms : [ 0.516061] node 0 deferred pages initialised in 230ms : [ 0.516522] node 7 deferred pages initialised in 230ms : [ 0.516672] node 4 deferred pages initialised in 230ms : [ 0.516798] node 6 deferred pages initialised in 230ms : [ 0.517051] node 5 deferred pages initialised in 230ms : [ 0.523887] node 1 deferred pages initialised in 240ms : : vs with the patch: : : [ 0.379613] node 0 deferred pages initialised in 90ms : [ 0.380388] node 1 deferred pages initialised in 90ms : [ 0.380540] node 4 deferred pages initialised in 100ms : [ 0.390239] node 6 deferred pages initialised in 100ms : [ 0.390249] node 2 deferred pages initialised in 100ms : [ 0.390786] node 3 deferred pages initialised in 110ms : [ 0.396721] node 5 deferred pages initialised in 110ms : [ 0.397095] node 7 deferred pages initialised in 110ms : : Which is a nice speedup. [echanude@redhat.com: v3] Link: https://lkml.kernel.org/r/20240528185455.643227-4-echanude@redhat.com Link: https://lkml.kernel.org/r/20240522203758.626932-4-echanude@redhat.com Signed-off-by: Eric Chanudet Tested-by: Michael Ellerman (powerpc) Reviewed-by: Baoquan He Acked-by: Alexander Gordeev Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/memblock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e2082240586d..40c62aca36ec 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -335,8 +335,6 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, for (; i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) -int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); - #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** -- cgit v1.2.3 From ffc3c8a631eeadd0de63605cecfb6ba544245352 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 09:49:50 +0800 Subject: mm: memcontrol: remove page_memcg() The page_memcg() only called by mod_memcg_page_state(), so squash it to cleanup page_memcg(). Link: https://lkml.kernel.org/r/20240524014950.187805-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 030d34e9d117..3d1599146afe 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -443,11 +443,6 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return __folio_memcg(folio); } -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return folio_memcg(page_folio(page)); -} - /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. @@ -1014,7 +1009,7 @@ static inline void mod_memcg_page_state(struct page *page, return; rcu_read_lock(); - memcg = page_memcg(page); + memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); @@ -1133,11 +1128,6 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return NULL; } -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} - static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -1636,7 +1626,7 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, spin_unlock_irqrestore(&lruvec->lru_lock, flags); } -/* Test requires a stable page->memcg binding, see page_memcg() */ +/* Test requires a stable folio->memcg binding, see folio_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { -- cgit v1.2.3 From 06668257a355a05483c188e35a18c80471d06987 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 May 2024 19:18:10 +0100 Subject: mm: remove page_mapping() All callers are now converted, delete this compatibility wrapper. Also fix up some comments which referred to page_mapping. 
Link: https://lkml.kernel.org/r/20240423225552.4113447-7-willy@infradead.org Link: https://lkml.kernel.org/r/20240524181813.698813-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Eric Biggers Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 +- include/linux/page-flags.h | 23 ++++++++++++----------- include/linux/pagemap.h | 1 - 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e022e40b099e..14acf1bbe0ce 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -53,7 +53,7 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); * filesystem and block layers. Nowadays the basic I/O unit * is the bio, and buffer_heads are used for extracting block * mappings (via a get_block_t call), for tracking state within - * a page (via a page_mapping) and for wrapping bio submission + * a folio (via a folio_mapping) and for wrapping bio submission * for backward compatibility reasons (e.g. submit_bh). */ struct buffer_head { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b9e914e1face..813eea2efe26 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -655,27 +655,28 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space; + * On an anonymous folio mapped into a user virtual memory area, + * folio->mapping points to its anon_vma, not to a struct address_space; * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON - * bit; and then page->mapping points, not to an anon_vma, but to a private + * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable - * page and then page->mapping points to a struct movable_operations. + * page and then folio->mapping points to a struct movable_operations. * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. + * Please note that, confusingly, "folio_mapping" refers to the inode + * address_space which maps the folio from disk; whereas "folio_mapped" + * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its - * internal states, the page->mapping does not exist as such, nor do these - * flags below. So in order to avoid testing non-existent bits, please - * make sure that PageSlab(page) actually evaluates to false before calling - * the following functions (e.g., PageAnon). See mm/slab.h. + * internal states, the folio->mapping does not exist as such, nor do + * these flags below. So in order to avoid testing non-existent bits, + * please make sure that folio_test_slab(folio) actually evaluates to + * false before calling the following functions (e.g., folio_test_anon). + * See mm/slab.h. 
*/ #define PAGE_MAPPING_ANON 0x1 #define PAGE_MAPPING_MOVABLE 0x2 diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1586b5c21e81..e37e16ebff7a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -426,7 +426,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -- cgit v1.2.3 From 1df07243483c1a32c8f2f93b06dc52a583a4dd1c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:36:18 +0800 Subject: rmap: remove DEFINE_PAGE_VMA_WALK() This are no users since commit 40d707f33db5 ("mm/ksm: use folio in write_protect_page"), so remove DEFINE_PAGE_VMA_WALK(). Link: https://lkml.kernel.org/r/20240524053618.208895-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Cc: Alex Shi Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/rmap.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5cb0d419a1d7..bb53e5920b88 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -684,16 +684,6 @@ struct page_vma_mapped_walk { unsigned int flags; }; -#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ - struct page_vma_mapped_walk name = { \ - .pfn = page_to_pfn(_page), \ - .nr_pages = compound_nr(_page), \ - .pgoff = page_to_pgoff(_page), \ - .vma = _vma, \ - .address = _address, \ - .flags = _flags, \ - } - #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = folio_pfn(_folio), \ -- cgit v1.2.3 From 940d6683c79950b21b3762124eabfa9b2f6fee96 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:28:42 +0800 Subject: mm: migrate: remove migrate_folio_extra() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_folio_extra() is only called in migrate.c now, convert it a static function and take a new src_private argument which could be shared by migrate_folio() and filemap_migrate_folio() to simplify code a bit. 
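A rough sketch of the shared static helper this enables in mm/migrate.c; the name, body and handling of folio private data are assumptions based on the description above, not the exact implementation:

	static int __migrate_folio(struct address_space *mapping, struct folio *dst,
				   struct folio *src, void *src_private,
				   enum migrate_mode mode)
	{
		int rc = folio_migrate_mapping(mapping, dst, src, 0);

		if (rc != MIGRATEPAGE_SUCCESS)
			return rc;

		/* filemap_migrate_folio() passes its private data, migrate_folio() passes NULL */
		if (src_private)
			folio_attach_private(dst, folio_detach_private(src));

		folio_migrate_copy(dst, src);
		return MIGRATEPAGE_SUCCESS;
	}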
Link: https://lkml.kernel.org/r/20240524052843.182275-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Tony Luck Cc: Vishal Moola (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 2ce13e8a309b..517f70b70620 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -63,8 +63,6 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION void putback_movable_pages(struct list_head *l); -int migrate_folio_extra(struct address_space *mapping, struct folio *dst, - struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, -- cgit v1.2.3 From 906632843d00a4a42072b19c423b30a9e7adef3c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:28:43 +0800 Subject: mm: remove MIGRATE_SYNC_NO_COPY mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 2916ecc0f9d4 ("mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY") introduce a new MIGRATE_SYNC_NO_COPY mode to allow to offload the copy to a device DMA engine, which is only used __migrate_device_pages() to decide whether or not copy the old page, and the MIGRATE_SYNC_NO_COPY mode only set in hmm, as the MIGRATE_SYNC_NO_COPY set is removed by previous cleanup, it seems that we could remove the unnecessary MIGRATE_SYNC_NO_COPY. Link: https://lkml.kernel.org/r/20240524052843.182275-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Tony Luck Cc: Vishal Moola (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/migrate_mode.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index f37cc03f9369..9fb482bb7323 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -7,16 +7,11 @@ * on most operations but not ->writepage as the potential stall time * is too significant * MIGRATE_SYNC will block when migrating pages - * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages - * with the CPU. Instead, page copy happens outside the migratepage() - * callback and is likely using a DMA engine. See migrate_vma() and HMM - * (mm/hmm.c) for users of this mode. */ enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC, - MIGRATE_SYNC_NO_COPY, }; enum migrate_reason { -- cgit v1.2.3 From ebfba0045176cb013f49cb3e5bd9f0b16eba203c Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Wed, 29 May 2024 20:28:19 +1200 Subject: mm: swap: introduce swap_free_nr() for batched swap_free() Patch series "large folios swap-in: handle refault cases first", v5. This patchset is extracted from the large folio swapin series[1], primarily addressing the handling of scenarios involving large folios in the swap cache. 
Currently, it is particularly focused on addressing the refaulting of mTHP, which is still undergoing reclamation. This approach aims to streamline code review and expedite the integration of this segment into the MM tree. It relies on Ryan's swap-out series[2], leveraging the helper function swap_pte_batch() introduced by that series. Presently, do_swap_page only encounters a large folio in the swap cache before the large folio is released by vmscan. However, the code should remain equally useful once we support large folio swap-in via swapin_readahead(). This approach can effectively reduce page faults and eliminate most redundant checks and early exits for MTE restoration in recent MTE patchset[3]. The large folio swap-in for SWP_SYNCHRONOUS_IO and swapin_readahead() will be split into separate patch sets and sent at a later time. [1] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/ [2] https://lore.kernel.org/linux-mm/20240408183946.2991168-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-mm/20240322114136.61386-1-21cnbao@gmail.com/ This patch (of 6): While swapping in a large folio, we need to free swaps related to the whole folio. To avoid frequently acquiring and releasing swap locks, it is better to introduce an API for batched free. Furthermore, this new function, swap_free_nr(), is designed to efficiently handle various scenarios for releasing a specified number, nr, of swap entries. Link: https://lkml.kernel.org/r/20240529082824.150954-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240529082824.150954-2-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: Chris Li Reviewed-by: "Huang, Ying" Cc: Baolin Wang Cc: David Hildenbrand Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Cc: Andreas Larsson Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Khalid Aziz Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index bd450023b9a4..0f41fe49c9dc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -478,6 +478,7 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); +extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -559,6 +560,10 @@ static inline void swap_free(swp_entry_t swp) { } +static inline void swap_free_nr(swp_entry_t entry, int nr_pages) +{ +} + static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } -- cgit v1.2.3 From 54f7a49c20ebb5189980c53e6e66709d22bee572 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:20 +1200 Subject: mm: remove the implementation of swap_free() and always use swap_free_nr() To streamline maintenance efforts, we propose removing the implementation of swap_free(). Instead, we can simply invoke swap_free_nr() with nr set to 1. swap_free_nr() is designed with a bitmap consisting of only one long, resulting in overhead that can be ignored for cases where nr equals 1. 
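A minimal sketch of the batched pattern, assuming a hypothetical swap-in path that has just mapped nr_pages contiguous swap entries (swap_free_nr() is the helper added by the previous patch):

	/* Before: free each entry of the folio one by one */
	for (i = 0; i < nr_pages; i++)
		swap_free(swp_entry(swp_type(entry), swp_offset(entry) + i));

	/* After: one batched call covering the whole range */
	swap_free_nr(entry, nr_pages);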
A prime candidate for leveraging swap_free_nr() lies within kernel/power/swap.c. Implementing this change facilitates the adoption of batch processing for hibernation. Link: https://lkml.kernel.org/r/20240529082824.150954-3-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Acked-by: Chris Li Reviewed-by: Ryan Roberts Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Andreas Larsson Cc: Baolin Wang Cc: Chuanhua Han Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0f41fe49c9dc..d33ce740b695 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -477,7 +477,6 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); -extern void swap_free(swp_entry_t); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -556,10 +555,6 @@ static inline int swapcache_prepare(swp_entry_t swp) return 0; } -static inline void swap_free(swp_entry_t swp) -{ -} - static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } @@ -608,6 +603,11 @@ static inline void free_swap_and_cache(swp_entry_t entry) free_swap_and_cache_nr(entry, 1); } +static inline void swap_free(swp_entry_t entry) +{ + swap_free_nr(entry, 1); +} + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { -- cgit v1.2.3 From 29f252cdc293f4a50b5d3dcbed53701d8444614d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:22 +1200 Subject: mm: introduce arch_do_swap_page_nr() which allows restore metadata for nr pages Should do_swap_page() have the capability to directly map a large folio, metadata restoration becomes necessary for a specified number of pages denoted as nr. It's important to highlight that metadata restoration is solely required by the SPARC platform, which, however, does not enable THP_SWAP. Consequently, in the present kernel configuration, there exists no practical scenario where users necessitate the restoration of nr metadata. Platforms implementing THP_SWAP might invoke this function with nr values exceeding 1, subsequent to do_swap_page() successfully mapping an entire large folio. Nonetheless, their arch_do_swap_page_nr() functions remain empty. Link: https://lkml.kernel.org/r/20240529082824.150954-5-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: Khalid Aziz Cc: "David S. Miller" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: Chuanhua Han Cc: David Hildenbrand Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. 
Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 117b807e3f89..2f32eaccf0b9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1089,6 +1089,15 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + +} +#else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be @@ -1097,12 +1106,17 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ -static inline void arch_do_swap_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, - pte_t pte, pte_t oldpte) -{ - +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + for (int i = 0; i < nr; i++) { + arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, + pte_advance_pfn(pte, i), + pte_advance_pfn(oldpte, i)); + } } #endif -- cgit v1.2.3 From 16540dae959d862909716d5c854e9b3aee285609 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 30 May 2024 10:14:27 -0700 Subject: mm/hugetlb: mm/memory_hotplug: use a folio in scan_movable_pages() By using a folio in scan_movable_pages() we convert the last user of the page-based hugetlb information macro functions to the folio version. After this conversion, we can safely remove the page-based definitions from include/linux/hugetlb.h. Link: https://lkml.kernel.org/r/20240530171427.242018-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 15a58f69782c..279aca379b95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -616,9 +616,7 @@ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ - } \ -static inline int HPage##uname(struct page *page) \ - { return test_bit(HPG_##flname, &(page->private)); } + } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ @@ -637,8 +635,6 @@ void folio_clear_hugetlb_##flname(struct folio *folio) \ #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ - { return 0; } \ -static inline int HPage##uname(struct page *page) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ -- cgit v1.2.3 From fefc6e6631ff43427e81f08c8e49f7787ff0213a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 28 May 2024 09:40:50 -0700 Subject: memcg: rearrange fields of mem_cgroup_per_node Kernel test robot reported [1] performance regression for will-it-scale test suite's page_fault2 test case for the commit 70a64b7919cb ("memcg: dynamically allocate lruvec_stats"). 
After inspection it seems like the commit has unintentionally introduced false cache sharing. After the commit the fields of mem_cgroup_per_node which get read on the performance critical path share the cacheline with the fields which get updated often on LRU page allocations or deallocations. This has caused contention on that cacheline and the workloads which manipulates a lot of LRU pages are regressed as reported by the test report. The solution is to rearrange the fields of mem_cgroup_per_node such that the false sharing is eliminated. Let's move all the read only pointers at the start of the struct, followed by memcg-v1 only fields and at the end fields which get updated often. Experiment setup: Ran fallocate1, fallocate2, page_fault1, page_fault2 and page_fault3 from the will-it-scale test suite inside a three level memcg with /tmp mounted as tmpfs on two different machines, one a single numa node and the other one, two node machine. $ ./[testcase]_processes -t $NR_CPUS -s 50 Results for single node, 52 CPU machine: Testcase base with-patch fallocate1 1031081 1431291 (38.80 %) fallocate2 1029993 1421421 (38.00 %) page_fault1 2269440 3405788 (50.07 %) page_fault2 2375799 3572868 (50.30 %) page_fault3 28641143 28673950 ( 0.11 %) Results for dual node, 80 CPU machine: Testcase base with-patch fallocate1 2976288 3641185 (22.33 %) fallocate2 2979366 3638181 (22.11 %) page_fault1 6221790 7748245 (24.53 %) page_fault2 6482854 7847698 (21.05 %) page_fault3 28804324 28991870 ( 0.65 %) Link: https://lkml.kernel.org/r/20240528164050.2625718-1-shakeel.butt@linux.dev Fixes: 70a64b7919cb ("memcg: dynamically allocate lruvec_stats") Signed-off-by: Shakeel Butt Reported-by: kernel test robot Reviewed-by: Yosry Ahmed Reviewed-by: Roman Gushchin Cc: Feng Tang Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3d1599146afe..7403dd5926eb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -96,23 +96,29 @@ struct mem_cgroup_reclaim_iter { * per-node information in memory controller. */ struct mem_cgroup_per_node { - struct lruvec lruvec; + /* Keep the read-only fields at the start */ + struct mem_cgroup *memcg; /* Back pointer, we cannot */ + /* use container_of */ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats *lruvec_stats; - - unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; - - struct mem_cgroup_reclaim_iter iter; - struct shrinker_info __rcu *shrinker_info; + /* + * Memcg-v1 only stuff in middle as buffer between read mostly fields + * and update often fields to avoid false sharing. Once v1 stuff is + * moved in a separate struct, an explicit padding is needed. + */ + struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - struct mem_cgroup *memcg; /* Back pointer, we cannot */ - /* use container_of */ + + /* Fields which get updated often at the end. 
*/ + struct lruvec lruvec; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter iter; }; struct mem_cgroup_threshold { -- cgit v1.2.3 From 21664442be1bf79094dd9d21b8833acbfbd80ea7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 28 May 2024 16:43:13 +0200 Subject: percpu: add __this_cpu_try_cmpxchg() Add __this_cpu_try_cmpxchg() version of the percpu op. Link: https://lkml.kernel.org/r/20240528144345.5980-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Dennis Zhou Cc: Christoph Hellwig Cc: Lorenzo Stoakes Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index ec3573119923..8efce7414fad 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -475,6 +475,12 @@ do { \ raw_cpu_cmpxchg(pcp, oval, nval); \ }) +#define __this_cpu_try_cmpxchg(pcp, ovalp, nval) \ +({ \ + __this_cpu_preempt_check("try_cmpxchg"); \ + raw_cpu_try_cmpxchg(pcp, ovalp, nval); \ +}) + #define __this_cpu_sub(pcp, val) __this_cpu_add(pcp, -(typeof(pcp))(val)) #define __this_cpu_inc(pcp) __this_cpu_add(pcp, 1) #define __this_cpu_dec(pcp) __this_cpu_sub(pcp, 1) -- cgit v1.2.3 From 4b98995530b77a97912230d8e1564ba7738db19c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:07 +0800 Subject: mm: shmem: add multi-size THP sysfs interface for anonymous shmem To support the use of mTHP with anonymous shmem, add a new sysfs interface 'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-kB/' directory for each mTHP to control whether shmem is enabled for that mTHP, with a value similar to the top level 'shmem_enabled', which can be set to: "always", "inherit (to inherit the top level setting)", "within_size", "advise", "never". An 'inherit' option is added to ensure compatibility with these global settings, and the options 'force' and 'deny' are dropped, which are rather testing artifacts from the old ages. By default, PMD-sized hugepages have enabled="inherit" and all other hugepage sizes have enabled="never" for '/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'. In addition, if top level value is 'force', then only PMD-sized hugepages have enabled="inherit", otherwise configuration will be failed and vice versa. That means now we will avoid using non-PMD sized THP to override the global huge allocation. 
[baolin.wang@linux.alibaba.com: fix transhuge.rst indentation] Link: https://lkml.kernel.org/r/b189d815-998b-4dfd-ba89-218ff51313f8@linux.alibaba.com [akpm@linux-foundation.org: reflow transhuge.rst addition to 80 cols] [baolin.wang@linux.alibaba.com: move huge_shmem_orders_lock under CONFIG_SYSFS] Link: https://lkml.kernel.org/r/eb34da66-7f12-44f3-a39e-2bcc90c33354@linux.alibaba.com [akpm@linux-foundation.org: huge_memory.c needs mm_types.h] Link: https://lkml.kernel.org/r/ffddfa8b3cb4266ff963099ab78cfd7184c57ac7.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 088d66a54643..6a1edd752968 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,7 @@ #include #include /* only for vma_is_dax() */ +#include vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -63,6 +64,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; +extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for anonymous THP; all orders up to @@ -265,6 +267,14 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, -- cgit v1.2.3 From e7a2ab7b3bb5d87f99f2ea3d4481d52fc5ceb52d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:08 +0800 Subject: mm: shmem: add mTHP support for anonymous shmem Commit 19eaf44954df adds multi-size THP (mTHP) for anonymous pages, that can allow THP to be configured through the sysfs interface located at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'. However, the anonymous shmem will ignore the anonymous mTHP rule configured through the sysfs interface, and can only use the PMD-mapped THP, that is not reasonable. Users expect to apply the mTHP rule for all anonymous pages, including the anonymous shmem, in order to enjoy the benefits of mTHP. For example, lower latency than PMD-mapped THP, smaller memory bloat than PMD-mapped THP, contiguous PTEs on ARM architecture to reduce TLB miss etc. In addition, the mTHP interfaces can be extended to support all shmem/tmpfs scenarios in the future, especially for the shmem mmap() case. The primary strategy is similar to supporting anonymous mTHP. Introduce a new interface '/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled', which can have almost the same values as the top-level '/sys/kernel/mm/transparent_hugepage/shmem_enabled', with adding a new additional "inherit" option and dropping the testing options 'force' and 'deny'. By default all sizes will be set to "never" except PMD size, which is set to "inherit". 
This ensures backward compatibility with the anonymous shmem enabled of the top level, meanwhile also allows independent control of anonymous shmem enabled for each mTHP. Link: https://lkml.kernel.org/r/65796c1e72e51e15f3410195b5c2d5b6c160d411.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6a1edd752968..53b7a137f460 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -560,6 +560,16 @@ static inline bool thp_migration_supported(void) { return false; } + +static inline int highest_order(unsigned long orders) +{ + return 0; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, -- cgit v1.2.3 From 66f44583f9b617d74ffa2487e75a9c3adf344ddb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:10 +0800 Subject: mm: shmem: add mTHP counters for anonymous shmem Add mTHP counters for anonymous shmem. [baolin.wang@linux.alibaba.com: update Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/d86e2e7f-4141-432b-b2ba-c6691f36ef0b@linux.alibaba.com Link: https://lkml.kernel.org/r/4fd9e467d49ae4a747e428bcd821c7d13125ae67.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lance Yang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 53b7a137f460..7ad41de5eaea 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -281,6 +281,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_FILE_ALLOC, + MTHP_STAT_FILE_FALLBACK, + MTHP_STAT_FILE_FALLBACK_CHARGE, __MTHP_STAT_COUNT }; -- cgit v1.2.3 From cdd9a571b7d8465353fe2e2d8d1edd8a58d82021 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 14:23:56 +0200 Subject: fs/proc: move page_mapcount() to fs/proc/internal.h ... and rename it to folio_precise_page_mapcount(). fs/proc is the last remaining user, and that should stay that way. While at it, cleanup kpagecount_read() a bit: there are still some legacy leftovers -- when the interface was introduced it returned the page refcount, but was changed briefly afterwards to return the page mapcount. Further, some simple folio conversion. Once we stop using the per-page mapcounts of large folios, all folio_precise_page_mapcount() users will have to implement an alternative way to achieve what they are trying to achieve, possibly in a less precise way. 
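A hedged sketch of what the helper moved into fs/proc/internal.h might look like, mirroring the page_mapcount() body removed below (the exact body is an assumption):

	static inline int folio_precise_page_mapcount(struct folio *folio,
						      struct page *page)
	{
		int mapcount = atomic_read(&page->_mapcount) + 1;

		/* Handle page_has_type() pages */
		if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
			mapcount = 0;
		if (unlikely(folio_test_large(folio)))
			mapcount += folio_entire_mapcount(folio);

		return mapcount;
	}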
[dan.carpenter@linaro.org: fix uninitialized variable in pagemap_pmd_range()] Link: https://lkml.kernel.org/r/9d6eaba7-92f8-4a70-8765-38a519680a87@moroto.mountain Link: https://lkml.kernel.org/r/20240607122357.115423-6-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Dan Carpenter Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f4bf94ca99e3..7f864d5f52b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1202,8 +1202,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes - it does not include PTE-mapped sub-pages; look - * at folio_mapcount() or page_mapcount() instead. + * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { @@ -1221,30 +1220,6 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -/** - * page_mapcount() - Number of times this precise page is mapped. - * @page: The page. - * - * The number of times this page is mapped. If this page is part of - * a large folio, it includes the number of times this page is mapped - * as part of that folio. - * - * Will report 0 for pages which cannot be mapped into userspace, eg - * slab, page tables and similar. - */ -static inline int page_mapcount(struct page *page) -{ - int mapcount = atomic_read(&page->_mapcount) + 1; - - /* Handle page_has_type() pages */ - if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) - mapcount = 0; - if (unlikely(PageCompound(page))) - mapcount += folio_entire_mapcount(page_folio(page)); - - return mapcount; -} - static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); -- cgit v1.2.3 From 7a581204b1fa4fed233ed3b7ffcf6847cc383dbc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 10:37:10 +0200 Subject: mm/highmem: reimplement totalhigh_pages() by walking zones Patch series "mm/highmem: don't track highmem pages manually". Let's remove highmem special-casing from adjust_managed_page_count(), to result in less confusion why memblock manually adjusts totalram_pages, and __free_pages_core() only adjusts the zone's managed pages -- what about the highmem pages that adjust_managed_page_count() updates? Now, we only maintain totalram_pages and a zone's managed pages independent of highmem support. We can derive the number of highmem pages simply by looking at the relevant zone's managed pages. I don't think there is any particular fast path that needs a maximum-efficient totalhigh_pages() implementation. Note that highmem memory is currently initialized using free_highmem_page()->free_reserved_page(), not __free_pages_core(). In the future we might want to also use __free_pages_core() to initialize highmem memory, to make that less special, and consider moving totalram_pages updates into __free_pages_core() [1], so we can just use adjust_managed_page_count() in there as well. 
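Since each zone already tracks its managed pages, the derived highmem count can come from a plain zone walk; a minimal sketch under that assumption (the real implementation lives in mm/highmem.c and is not part of this diff):

	unsigned long __totalhigh_pages(void)
	{
		unsigned long pages = 0;
		struct zone *zone;

		/* Sum managed pages of all highmem zones instead of a global counter */
		for_each_populated_zone(zone) {
			if (is_highmem(zone))
				pages += zone_managed_pages(zone);
		}

		return pages;
	}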
Booting a simple kernel in QEMU reveals no highmem accounting change: Before: Memory: 3095448K/3145208K available (14802K kernel code, 2073K rwdata, 5000K rodata, 740K init, 556K bss, 49760K reserved, 0K cma-reserved, 2244488K highmem) After: Memory: 3095276K/3145208K available (14802K kernel code, 2073K rwdata, 5000K rodata, 740K init, 556K bss, 49932K reserved, 0K cma-reserved, 2244488K highmem) [1] https://lkml.kernel.org/r/20240601133402.2675-1-richard.weiyang@gmail.com This patch (of 2): Can we get rid of the highmem ifdef in adjust_managed_page_count()? Likely yes: we don't have that many totalhigh_pages() users, and they all don't seem to be very performance critical. So let's implement totalhigh_pages() like nr_free_highpages(), collecting information from all zones. This is now similar to what we do in si_meminfo_node() to collect the per-node highmem page count. In the common case (single node, 3-4 zones), we really shouldn't care. We could optimize a bit further (only walk ZONE_HIGHMEM and ZONE_MOVABLE if required), but there doesn't seem a real need for that. [david@redhat.com: fix build bot complaint] Link: https://lkml.kernel.org/r/b57e5bc4-eb72-40e3-add4-57dfa6e03df6@redhat.com Link: https://lkml.kernel.org/r/20240607083711.62833-1-david@redhat.com Link: https://lkml.kernel.org/r/20240607083711.62833-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index a3028e400a9c..65f865fbbac0 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -132,7 +132,7 @@ static inline void __kunmap_atomic(const void *addr) } unsigned int __nr_free_highpages(void); -extern atomic_long_t _totalhigh_pages; +unsigned long __totalhigh_pages(void); static inline unsigned int nr_free_highpages(void) { @@ -141,12 +141,7 @@ static inline unsigned int nr_free_highpages(void) static inline unsigned long totalhigh_pages(void) { - return (unsigned long)atomic_long_read(&_totalhigh_pages); -} - -static inline void totalhigh_pages_add(long count) -{ - atomic_long_add(count, &_totalhigh_pages); + return __totalhigh_pages(); } static inline bool is_kmap_addr(const void *x) -- cgit v1.2.3 From 90b8fab5cdc441735d388e286c715da7c85b24f1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 10:37:11 +0200 Subject: mm/highmem: make nr_free_highpages() return "unsigned long" It looks rather weird that totalhigh_pages() returns an "unsigned long" but nr_free_highpages() returns an "unsigned int". Let's return an "unsigned long" from nr_free_highpages() to be consistent. While at it, use a plain "0" instead of a "0UL" in the !CONFIG_HIGHMEM totalhigh_pages() implementation, to make these look alike as well. 
Link: https://lkml.kernel.org/r/20240607083711.62833-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 8 ++++---- include/linux/highmem.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 65f865fbbac0..dd100e849f5e 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -131,10 +131,10 @@ static inline void __kunmap_atomic(const void *addr) preempt_enable(); } -unsigned int __nr_free_highpages(void); +unsigned long __nr_free_highpages(void); unsigned long __totalhigh_pages(void); -static inline unsigned int nr_free_highpages(void) +static inline unsigned long nr_free_highpages(void) { return __nr_free_highpages(); } @@ -234,8 +234,8 @@ static inline void __kunmap_atomic(const void *addr) preempt_enable(); } -static inline unsigned int nr_free_highpages(void) { return 0; } -static inline unsigned long totalhigh_pages(void) { return 0UL; } +static inline unsigned long nr_free_highpages(void) { return 0; } +static inline unsigned long totalhigh_pages(void) { return 0; } static inline bool is_kmap_addr(const void *x) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 00341b56d291..fa6891e06316 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -179,7 +179,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ -static inline unsigned int nr_free_highpages(void); +static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE -- cgit v1.2.3 From 29e847d2ade3cdff36afe095fdbeb9b5f71a197a Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 14 Jun 2024 09:51:37 +0800 Subject: mm/rmap: integrate PMD-mapped folio splitting into pagewalk loop In preparation for supporting try_to_unmap_one() to unmap PMD-mapped folios, start the pagewalk first, then call split_huge_pmd_address() to split the folio. 
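A simplified sketch of the resulting try_to_unmap_one() loop (TTU flag checks, TLB flushing and the actual PTE unmapping are omitted):

while (page_vma_mapped_walk(&pvmw)) {
	if (!pvmw.pte) {
		/*
		 * Found a PMD mapping: split it under the PTL that
		 * page_vma_mapped_walk() already holds, then restart the
		 * walk to process the freshly created PTEs.
		 */
		split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
				      false, folio);
		page_vma_mapped_walk_restart(&pvmw);
		continue;
	}

	/* ... unmap the PTE-mapped page as before ... */
}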
Link: https://lkml.kernel.org/r/20240614015138.31461-3-ioworker0@gmail.com Signed-off-by: Lance Yang Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Suggested-by: Baolin Wang Acked-by: Zi Yan Cc: Bang Li Cc: Barry Song Cc: Fangrui Song Cc: Jeff Xie Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: SeongJae Park Cc: Yang Shi Cc: Yin Fengwei Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++++++ include/linux/rmap.h | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7ad41de5eaea..9f720b0731c4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -428,6 +428,9 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } +void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, bool freeze, struct folio *folio); + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool folio_test_pmd_mappable(struct folio *folio) @@ -490,6 +493,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) {} +static inline void split_huge_pmd_locked(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + bool freeze, struct folio *folio) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb53e5920b88..bf46787c8eba 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -703,6 +703,30 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) spin_unlock(pvmw->ptl); } +/** + * page_vma_mapped_walk_restart - Restart the page table walk. + * @pvmw: Pointer to struct page_vma_mapped_walk. + * + * It restarts the page table walk when changes occur in the page + * table, such as splitting a PMD. Ensures that the PTL held during + * the previous walk is released and resets the state to allow for + * a new walk starting at the current address stored in pvmw->address. + */ +static inline void +page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw) +{ + WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte); + + if (likely(pvmw->ptl)) + spin_unlock(pvmw->ptl); + else + WARN_ON_ONCE(1); + + pvmw->ptl = NULL; + pvmw->pmd = NULL; + pvmw->pte = NULL; +} + bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); /* -- cgit v1.2.3 From 735ecdfaf4e802209caf34b7ac45adf448c54ccc Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 14 Jun 2024 09:51:38 +0800 Subject: mm/vmscan: avoid split lazyfree THP during shrink_folio_list() When the user no longer requires the pages, they would use madvise(MADV_FREE) to mark the pages as lazy free. Subsequently, they typically would not re-write to that memory again. During memory reclaim, if we detect that the large folio and its PMD are both still marked as clean and there are no unexpected references (such as GUP), so we can just discard the memory lazily, improving the efficiency of memory reclamation in this case. 
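Sketched against the helper declared below, the check in try_to_unmap_one() looks roughly like this (illustrative only; the real code also honours the TTU flags and dirty bits before discarding):

if (!pvmw.pte && folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
	/*
	 * A clean, lazyfree (MADV_FREE) PMD-mapped THP with no unexpected
	 * references can be unmapped and discarded without splitting it.
	 */
	if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) {
		/* whole folio unmapped and discarded lazily; walk is done */
	} else {
		/* fall back to split_huge_pmd_locked() + per-PTE unmap */
	}
}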
On an Intel i5 CPU, reclaiming 1GiB of lazyfree THPs using mem_cgroup_force_empty() results in the following runtimes in seconds (shorter is better): -------------------------------------------- | Old | New | Change | -------------------------------------------- | 0.683426 | 0.049197 | -92.80% | -------------------------------------------- [ioworker0@gmail.com: minor changes per David] Link: https://lkml.kernel.org/r/20240622100057.3352-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240614015138.31461-4-ioworker0@gmail.com Signed-off-by: Lance Yang Suggested-by: Zi Yan Suggested-by: David Hildenbrand Cc: Bang Li Cc: Baolin Wang Cc: Barry Song Cc: Fangrui Song Cc: Jeff Xie Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: SeongJae Park Cc: Yang Shi Cc: Yin Fengwei Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9f720b0731c4..212cca384d7e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -430,6 +430,8 @@ static inline bool thp_migration_supported(void) void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze, struct folio *folio); +bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct folio *folio); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -497,6 +499,13 @@ static inline void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze, struct folio *folio) {} +static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp, + struct folio *folio) +{ + return false; +} + #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) -- cgit v1.2.3 From 2b33a97c94bc44468fc1d54b745269c0cf0b7bb2 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 11 Jun 2024 02:45:14 +0000 Subject: mm: zswap: rename is_zswap_enabled() to zswap_is_enabled() In preparation for introducing a similar function, rename is_zswap_enabled() to use zswap_* prefix like other zswap functions. 
Link: https://lkml.kernel.org/r/20240611024516.1375191-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Barry Song Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a85b941db97..ce5e7bfe8f1e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -35,7 +35,7 @@ void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); -bool is_zswap_enabled(void); +bool zswap_is_enabled(void); #else struct zswap_lruvec_state {}; @@ -60,7 +60,7 @@ static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} static inline void zswap_folio_swapin(struct folio *folio) {} -static inline bool is_zswap_enabled(void) +static inline bool zswap_is_enabled(void) { return false; } -- cgit v1.2.3 From 2d4d2b1cfb85cc07f6d5619acb882d8b11e55cf4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 11 Jun 2024 02:45:15 +0000 Subject: mm: zswap: add zswap_never_enabled() Add zswap_never_enabled() to skip the xarray lookup in zswap_load() if zswap was never enabled on the system. It is implemented using static branches for efficiency, as enabling zswap should be a rare event. This could shave some cycles off zswap_load() when CONFIG_ZSWAP is used but zswap is never enabled. However, the real motivation behind this patch is two-fold: - Incoming large folio swapin work will need to fallback to order-0 folios if zswap was ever enabled, because any part of the folio could be in zswap, until proper handling of large folios with zswap is added. - A warning and recovery attempt will be added in a following change in case the above was not done incorrectly. Zswap will fail the read if the folio is large and it was ever enabled. Expose zswap_never_enabled() in the header for the swapin work to use it later. 
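A minimal sketch of the mechanism using a static key (names are illustrative; the real key and the point where it gets enabled live in mm/zswap.c):

static DEFINE_STATIC_KEY_FALSE(zswap_ever_enabled);

/* Flipped once, the first time zswap is switched on. */
static void zswap_note_enabled(void)
{
	static_branch_enable(&zswap_ever_enabled);
}

bool zswap_never_enabled(void)
{
	/* Compiles down to a patched branch, so the check is nearly free. */
	return !static_branch_unlikely(&zswap_ever_enabled);
}

zswap_load() can then bail out before any xarray lookup with a plain "if (zswap_never_enabled()) return false;".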
[yosryahmed@google.com: expose zswap_never_enabled() in the header] Link: https://lkml.kernel.org/r/Zmjf0Dr8s9xSW41X@google.com Link: https://lkml.kernel.org/r/20240611024516.1375191-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Nhat Pham Cc: Barry Song Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index ce5e7bfe8f1e..bf83ae5e285d 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -36,6 +36,7 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); bool zswap_is_enabled(void); +bool zswap_never_enabled(void); #else struct zswap_lruvec_state {}; @@ -65,6 +66,11 @@ static inline bool zswap_is_enabled(void) return false; } +static inline bool zswap_never_enabled(void) +{ + return false; +} + #endif #endif /* _LINUX_ZSWAP_H */ -- cgit v1.2.3 From 15995a35247442aefa0ffe36a6dad51cb46b0918 Mon Sep 17 00:00:00 2001 From: Sourav Panda Date: Wed, 5 Jun 2024 22:27:51 +0000 Subject: mm: report per-page metadata information Today, we do not have any observability of per-page metadata and how much it takes away from the machine capacity. Thus, we want to describe the amount of memory that is going towards per-page metadata, which can vary depending on build configuration, machine architecture, and system use. This patch adds 2 fields to /proc/vmstat that can used as shown below: Accounting per-page metadata allocated by boot-allocator: /proc/vmstat:nr_memmap_boot * PAGE_SIZE Accounting per-page metadata allocated by buddy-allocator: /proc/vmstat:nr_memmap * PAGE_SIZE Accounting total Perpage metadata allocated on the machine: (/proc/vmstat:nr_memmap_boot + /proc/vmstat:nr_memmap) * PAGE_SIZE Utility for userspace: Observability: Describe the amount of memory overhead that is going to per-page metadata on the system at any given time since this overhead is not currently observable. Debugging: Tracking the changes or absolute value in struct pages can help detect anomalies as they can be correlated with other metrics in the machine (e.g., memtotal, number of huge pages, etc). page_ext overheads: Some kernel features such as page_owner page_table_check that use page_ext can be optionally enabled via kernel parameters. Having the total per-page metadata information helps users precisely measure impact. Furthermore, page-metadata metrics will reflect the amount of struct pages reliquished (or overhead reduced) when hugetlbfs pages are reserved which will vary depending on whether hugetlb vmemmap optimization is enabled or not. For background and results see: lore.kernel.org/all/20240220214558.3377482-1-souravpanda@google.com Link: https://lkml.kernel.org/r/20240605222751.1406125-1-souravpanda@google.com Signed-off-by: Sourav Panda Acked-by: David Rientjes Reviewed-by: Pasha Tatashin Cc: Alexey Dobriyan Cc: Bjorn Helgaas Cc: Chen Linxuan Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Ivan Babrou Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Kirill A. Shutemov Cc: Liam R. Howlett Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: "Rafael J. 
Wysocki" Cc: Randy Dunlap Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Tomas Mudrunka Cc: Vlastimil Babka Cc: Wei Xu Cc: Yang Yang Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ include/linux/vmstat.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 586a8f0104d7..cb7f265c2b96 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,6 +220,8 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, + NR_MEMMAP, /* page metadata allocated through buddy allocator */ + NR_MEMMAP_BOOT, /* page metadata allocated through boot allocator */ NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 735eae6e272c..16b0cfa80502 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -624,4 +624,8 @@ static inline void lruvec_stat_sub_folio(struct folio *folio, { lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } + +void __meminit mod_node_early_perpage_metadata(int nid, long delta); +void __meminit store_early_perpage_metadata(void); + #endif /* _LINUX_VMSTAT_H */ -- cgit v1.2.3 From 47179fe03588caa13a9bae642b058901709ddc55 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 12 Jun 2024 09:24:08 +0000 Subject: mm/hugetlb_cgroup: prepare cftypes based on template Unlike other cgroup subsystems, the hugetlb cgroup does not provide a static array of cftype that explicitly displays the properties, handling functions, etc., of each file. Instead, it dynamically creates the attribute of cftypes based on the hstate during the startup procedure. This reduces the readability of the code. To fix this issue, introduce two templates of cftypes, and rebuild the attributes according to the hstate to make it ready to be added to cgroup framework. Link: https://lkml.kernel.org/r/20240612092409.2027592-3-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Cc: Muchun Song Cc: Oscar Salvador Cc: kernel test robot From: Xiu Jianfeng Subject: mm/hugetlb_cgroup: register lockdep key for cftype Date: Tue, 18 Jun 2024 07:19:22 +0000 When CONFIG_DEBUG_LOCK_ALLOC is enabled, the following commands can trigger a bug, mount -t cgroup2 none /sys/fs/cgroup cd /sys/fs/cgroup echo "+hugetlb" > cgroup.subtree_control The log is as below: BUG: key ffff8880046d88d8 has not been registered! ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(1) WARNING: CPU: 3 PID: 226 at kernel/locking/lockdep.c:4945 lockdep_init_map_type+0x185/0x220 Modules linked in: CPU: 3 PID: 226 Comm: bash Not tainted 6.10.0-rc4-next-20240617-g76db4c64526c #544 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:lockdep_init_map_type+0x185/0x220 Code: 00 85 c0 0f 84 6c ff ff ff 8b 3d 6a d1 85 01 85 ff 0f 85 5e ff ff ff 48 c7 c6 21 99 4a 82 48 c7 c7 60 29 49 82 e8 3b 2e f5 RSP: 0018:ffffc9000083fc30 EFLAGS: 00000282 RAX: 0000000000000000 RBX: ffffffff828dd820 RCX: 0000000000000027 RDX: ffff88803cd9cac8 RSI: 0000000000000001 RDI: ffff88803cd9cac0 RBP: ffff88800674fbb0 R08: ffffffff828ce248 R09: 00000000ffffefff R10: ffffffff8285e260 R11: ffffffff828b8eb8 R12: ffff8880046d88d8 R13: 0000000000000000 R14: 0000000000000000 R15: ffff8880067281c0 FS: 00007f68601ea740(0000) GS:ffff88803cd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005614f3ebc740 CR3: 000000000773a000 CR4: 00000000000006f0 Call Trace: ? __warn+0x77/0xd0 ? lockdep_init_map_type+0x185/0x220 ? 
report_bug+0x189/0x1a0 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? lockdep_init_map_type+0x185/0x220 __kernfs_create_file+0x79/0x100 cgroup_addrm_files+0x163/0x380 ? find_held_lock+0x2b/0x80 ? find_held_lock+0x2b/0x80 ? find_held_lock+0x2b/0x80 css_populate_dir+0x73/0x180 cgroup_apply_control_enable+0x12f/0x3a0 cgroup_subtree_control_write+0x30b/0x440 kernfs_fop_write_iter+0x13a/0x1f0 vfs_write+0x341/0x450 ksys_write+0x64/0xe0 do_syscall_64+0x4b/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f68602d9833 Code: 8b 15 61 26 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 64 8b 04 25 18 00 00 00 85 c0 75 14 b8 01 00 00 00 08 RSP: 002b:00007fff9bbdf8e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007f68602d9833 RDX: 0000000000000009 RSI: 00005614f3ebc740 RDI: 0000000000000001 RBP: 00005614f3ebc740 R08: 000000000000000a R09: 0000000000000008 R10: 00005614f3db6ba0 R11: 0000000000000246 R12: 0000000000000009 R13: 00007f68603bd6a0 R14: 0000000000000009 R15: 00007f68603b8880 For lockdep, there is a sanity check in lockdep_init_map_type(), the lock-class key must either have been allocated statically or must have been registered as a dynamic key. However the commit e18df2889ff9 ("mm/hugetlb_cgroup: prepare cftypes based on template") has changed the cftypes from static allocated objects to dynamic allocated objects, so the cft->lockdep_key must be registered proactively. [xiujianfeng@huawei.com: fix BUG()] Link: https://lkml.kernel.org/r/20240619015527.2212698-1-xiujianfeng@huawei.com Link: https://lkml.kernel.org/r/20240618071922.2127289-1-xiujianfeng@huawei.com Link: https://lore.kernel.org/all/602186b3-5ce3-41b3-90a3-134792cc2a48@samsung.com/ Fixes: e18df2889ff9 ("mm/hugetlb_cgroup: prepare cftypes based on template") Signed-off-by: Xiu Jianfeng Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202406181046.8d8b2492-oliver.sang@intel.com Tested-by: Marek Szyprowski Tested-by: SeongJae Park Closes: https://lore.kernel.org/20240618233608.400367-1-sj@kernel.org Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/cgroup-defs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861cd36..dc151cb0ef09 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -676,9 +676,7 @@ struct cftype { __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); -#ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; -#endif }; /* -- cgit v1.2.3 From b79d715c43354010775862cfcd2d01cb04d0de47 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 12 Jun 2024 09:24:09 +0000 Subject: mm/hugetlb_cgroup: switch to the new cftypes The previous patch has already reconstructed the cftype attributes based on the templates and saved them in dfl_cftypes and legacy_cftypes. then remove the old procedure and switch to the new cftypes. 
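The gist of the fix above: lockdep only accepts lock class keys that either live in static storage or were registered as dynamic keys, so a key embedded in a runtime-built cftype must be registered by hand. A sketch (the array name is illustrative; the real call site is mm/hugetlb_cgroup.c):

struct cftype *cft = &dfl_files[idx];	/* entry of a dynamically built array */

/*
 * Register the dynamic key before the cgroup file is created, otherwise
 * lockdep_init_map_type() warns "key ... has not been registered".
 */
lockdep_register_key(&cft->lockdep_key);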
Link: https://lkml.kernel.org/r/20240612092409.2027592-4-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 279aca379b95..a951c0d06061 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -686,11 +686,6 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_CGROUP_HUGETLB - /* cgroup control files */ - struct cftype cgroup_files_dfl[8]; - struct cftype cgroup_files_legacy[10]; -#endif char name[HSTATE_NAME_LEN]; }; -- cgit v1.2.3 From ceb32d6aa93ce3282a724f3a0828b4b84d5035f1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 12 Jun 2024 15:18:24 +0800 Subject: mm/memory-failure: remove MF_MSG_SLAB Since commit 46df8e73a4a3 ("mm: free up PG_slab"), MF_MSG_SLAB becomes unused. Remove it. No functional change intended. Link: https://lkml.kernel.org/r/20240612071835.157004-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Borislav Petkov (AMD) Cc: David Hildenbrand Cc: kernel test robot Cc: Naoya Horiguchi Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7f864d5f52b7..8a38adaf5480 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4067,7 +4067,6 @@ enum mf_result { enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, - MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, -- cgit v1.2.3 From 3a78f77fd1fb82c32cb7971a8a97c5cbc83ab69e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 12 Jun 2024 15:18:32 +0800 Subject: mm/memory-failure: move some function declarations into internal.h There are some functions only used inside mm. Move them into internal.h. No functional change intended. 
Link: https://lkml.kernel.org/r/20240612071835.157004-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405251049.hxjwX7zO-lkp@intel.com/ Cc: Borislav Petkov (AMD) Cc: David Hildenbrand Cc: Naoya Horiguchi Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ------- include/linux/page-flags.h | 5 ----- include/linux/rmap.h | 2 -- 3 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a38adaf5480..51fe207026f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4000,7 +4000,6 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); -struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { @@ -4021,12 +4020,6 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i) } #endif -#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM) -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr); -#endif - #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 813eea2efe26..b041bc058fb1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -616,11 +616,6 @@ PAGEFLAG_FALSE(Uncached, uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) -#define MAGIC_HWPOISON 0x48575053U /* HWPS */ -extern void SetPageHWPoisonTakenOff(struct page *page); -extern void ClearPageHWPoisonTakenOff(struct page *page); -extern bool take_page_off_buddy(struct page *page); -extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf46787c8eba..694d3d886245 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -747,8 +747,6 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); - /* * rmap_walk_control: To control rmap traversing for specific needs * -- cgit v1.2.3 From e36287c6e12d00f2087dc0c4899d8a9c54e22e1c Mon Sep 17 00:00:00 2001 From: Hyeongtak Ji Date: Fri, 14 Jun 2024 12:00:05 +0900 Subject: mm/damon/sysfs-schemes: add target_nid on sysfs-schemes This patch adds target_nid under /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes// The 'target_nid' can be used as the destination node for DAMOS actions such as DAMOS_MIGRATE_{HOT,COLD} in the follow up patches. 
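In-kernel users create such a scheme through damon_new_scheme(), which gains a target_nid parameter in this series. A sketch with purely illustrative values (DAMOS_MIGRATE_COLD itself only appears in a later patch of the series):

struct damos_access_pattern pattern = {
	.min_sz_region	 = PAGE_SIZE,
	.max_sz_region	 = ULONG_MAX,
	.min_nr_accesses = 0,
	.max_nr_accesses = 0,
	.min_age_region	 = 100,
	.max_age_region	 = UINT_MAX,
};
struct damos_quota quota = {};				/* no quota limit */
struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_NONE, };
struct damos *scheme;

/* Migrate regions matching the pattern to NUMA node 2. */
scheme = damon_new_scheme(&pattern, DAMOS_MIGRATE_COLD, 0, &quota, &wmarks,
			  /* target_nid */ 2);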
[sj@kernel.org: document target_nid file] Link: https://lkml.kernel.org/r/20240618213630.84846-3-sj@kernel.org Link: https://lkml.kernel.org/r/20240614030010.751-4-honggyu.kim@sk.com Signed-off-by: Hyeongtak Ji Signed-off-by: Honggyu Kim Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index f7da65e1ac04..21d6b69a015c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -374,6 +374,7 @@ struct damos_access_pattern { * @apply_interval_us: The time between applying the @action. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. @@ -389,6 +390,10 @@ struct damos_access_pattern { * monitoring context are inactive, DAMON stops monitoring either, and just * repeatedly checks the watermarks. * + * @target_nid is used to set the migration target node for migrate_hot or + * migrate_cold actions, which means it's only meaningful when @action is either + * "migrate_hot" or "migrate_cold". + * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect * &filters @@ -410,6 +415,9 @@ struct damos { /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; + union { + int target_nid; + }; struct list_head filters; struct damos_stat stat; struct list_head list; @@ -726,7 +734,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, unsigned long apply_interval_us, struct damos_quota *quota, - struct damos_watermarks *wmarks); + struct damos_watermarks *wmarks, + int target_nid); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -- cgit v1.2.3 From ced816a76b8fd1288d48523cc48ed308964d12ed Mon Sep 17 00:00:00 2001 From: Honggyu Kim Date: Fri, 14 Jun 2024 12:00:06 +0900 Subject: mm/migrate: add MR_DAMON to migrate_reason The current patch series introduces DAMON based migration across NUMA nodes so it'd be better to have a new migrate_reason in trace events. 
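Inside DAMON's paddr code the new reason is simply passed to migrate_pages(), so the existing mm_migrate_pages tracepoint can attribute the work. A sketch (gfp flags and error handling are illustrative):

unsigned int nr_succeeded = 0;
struct migration_target_control mtc = {
	.nid		= target_nid,
	.gfp_mask	= GFP_KERNEL,
};

migrate_pages(&migrate_folios, alloc_migration_target, NULL,
	      (unsigned long)&mtc, MIGRATE_SYNC, MR_DAMON, &nr_succeeded);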
Link: https://lkml.kernel.org/r/20240614030010.751-5-honggyu.kim@sk.com Signed-off-by: Honggyu Kim Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Hyeongtak Ji Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/migrate_mode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index 9fb482bb7323..265c4328b36a 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -24,6 +24,7 @@ enum migrate_reason { MR_CONTIG_RANGE, MR_LONGTERM_PIN, MR_DEMOTION, + MR_DAMON, MR_TYPES }; -- cgit v1.2.3 From b51820ebea656be3b48bb16dcdc5ad3f203c4fd7 Mon Sep 17 00:00:00 2001 From: Honggyu Kim Date: Fri, 14 Jun 2024 12:00:07 +0900 Subject: mm/damon/paddr: introduce DAMOS_MIGRATE_COLD action for demotion This patch introduces DAMOS_MIGRATE_COLD action, which is similar to DAMOS_PAGEOUT, but migrate folios to the given 'target_nid' in the sysfs instead of swapping them out. The 'target_nid' sysfs knob informs the migration target node ID. Here is one of the example usage of this 'migrate_cold' action. $ cd /sys/kernel/mm/damon/admin/kdamonds/ $ cat contexts//schemes//action migrate_cold $ echo 2 > contexts//schemes//target_nid $ echo commit > state $ numactl -p 0 ./hot_cold 500M 600M & $ numastat -c -p hot_cold Per-node process memory usage (in MBs) PID Node 0 Node 1 Node 2 Total -------------- ------ ------ ------ ----- 701 (hot_cold) 501 0 601 1101 Since there are some common routines with pageout, many functions have similar logics between pageout and migrate cold. damon_pa_migrate_folio_list() is a minimized version of shrink_folio_list(). Link: https://lkml.kernel.org/r/20240614030010.751-6-honggyu.kim@sk.com Signed-off-by: Honggyu Kim Signed-off-by: Hyeongtak Ji Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 21d6b69a015c..56714b6eb0d7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -105,6 +105,7 @@ struct damon_target { * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. + * @DAMOS_MIGRATE_COLD: Migrate the regions prioritizing colder regions. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * @@ -122,6 +123,7 @@ enum damos_action { DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, + DAMOS_MIGRATE_COLD, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; -- cgit v1.2.3 From b696722d784fb3610183281d2fd514b0efe97e3b Mon Sep 17 00:00:00 2001 From: Hyeongtak Ji Date: Fri, 14 Jun 2024 12:00:08 +0900 Subject: mm/damon/paddr: introduce DAMOS_MIGRATE_HOT action for promotion This patch introduces DAMOS_MIGRATE_HOT action, which is similar to DAMOS_MIGRATE_COLD, but proritizes hot pages. It migrates pages inside the given region to the 'target_nid' NUMA node in the sysfs. Here is one of the example usage of this 'migrate_hot' action. 
$ cd /sys/kernel/mm/damon/admin/kdamonds/ $ cat contexts//schemes//action migrate_hot $ echo 0 > contexts//schemes//target_nid $ echo commit > state $ numactl -p 2 ./hot_cold 500M 600M & $ numastat -c -p hot_cold Per-node process memory usage (in MBs) PID Node 0 Node 1 Node 2 Total -------------- ------ ------ ------ ----- 701 (hot_cold) 501 0 601 1101 Link: https://lkml.kernel.org/r/20240614030010.751-7-honggyu.kim@sk.com Signed-off-by: Hyeongtak Ji Signed-off-by: Honggyu Kim Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 56714b6eb0d7..3d62d98d6359 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -105,6 +105,7 @@ struct damon_target { * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. + * @DAMOS_MIGRATE_HOT: Migrate the regions prioritizing warmer regions. * @DAMOS_MIGRATE_COLD: Migrate the regions prioritizing colder regions. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions @@ -123,6 +124,7 @@ enum damos_action { DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, + DAMOS_MIGRATE_HOT, DAMOS_MIGRATE_COLD, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, -- cgit v1.2.3 From 3ad1dce6c30175dfd3da29d76853a69affbf7489 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 18 Jun 2024 11:17:58 -0700 Subject: mm/damon/core: implement DAMOS quota goals online commit function Patch series "mm/damon: introduce DAMON parameters online commit function". DAMON context struct (damon_ctx) contains user requests (parameters), internal status, and operation results. For flexible usages, DAMON API users are encouraged to manually manipulate the struct. That works well for simple use cases. However, it has turned out that it is not that simple at least for online parameters udpate. It is easy to forget properly maintaining internal status and operation results. Also, such manual manipulation for online tuning is implemented multiple times on DAMON API users including DAMON sysfs interface, DAMON_RECLAIM and DAMON_LRU_SORT. As a result, we have multiple sources of bugs for same problem. Actually we found and fixed a few bugs from online parameter updating of DAMON API users. Implement a function for online DAMON parameters update in core layer, and replace DAMON API users' manual manipulation code for the use case. The core layer function could still have bugs, but this change reduces the source of bugs for the problem to one place. This patch (of 12): Implement functions for supporting online DAMOS quota goals parameters update. The function receives two DAMOS quota structs. One is the struct that currently being used by a kdamond and therefore to be updated. The other one contains the parameters to be applied to the first one. The function applies the new parameters to the destination struct while keeping/updating the internal status. The function should be called from parameters-update safe place, like DAMON callbacks. 
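Usage then boils down to a single call from a parameters-update safe context, e.g. a DAMON callback (a sketch; 'running' is the scheme the kdamond currently uses, 'request' carries the new user parameters):

int err = damos_commit_quota_goals(&running->quota, &request->quota);

if (err)
	pr_warn("DAMOS quota goal commit failed: %d\n", err);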
Link: https://lkml.kernel.org/r/20240618181809.82078-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240618181809.82078-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3d62d98d6359..ce12c9f1b4e4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -742,6 +742,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, int target_nid); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); +int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src); struct damon_target *damon_new_target(void); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); -- cgit v1.2.3 From 9cb3d0b9dfce6a3258d91e6d69e418d0b4cce46a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 18 Jun 2024 11:17:59 -0700 Subject: mm/damon/core: implement DAMON context commit function Implement functions for supporting online DAMON context level parameters update. The function receives two DAMON context structs. One is the struct that currently being used by a kdamond and therefore to be updated. The other one contains the parameters to be applied to the first one. The function applies the new parameters to the destination struct while keeping/updating the internal status and operation results. The function should be called from DAMON context-update-safe place, like DAMON callbacks. Link: https://lkml.kernel.org/r/20240618181809.82078-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ce12c9f1b4e4..27c546bfc6d4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -756,6 +756,7 @@ void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); +int damon_commit_ctx(struct damon_ctx *old_ctx, struct damon_ctx *new_ctx); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); -- cgit v1.2.3 From 6d21dde7adc0a2deb9e0c6b06f42be3f88ca180d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 29 May 2024 13:18:59 +0200 Subject: mm: update _mapcount and page_type documentation Patch series "mm: page_type, zsmalloc and page_mapcount_reset()", v2. Wanting to remove the remaining abuser of _mapcount/page_type along with page_mapcount_reset(), I stumbled over zsmalloc, which is yet to be converted away from "struct page" [1]. Unfortunately, we cannot stop using the page_type field in zsmalloc code completely for its own purposes. All other fields in "struct page" are used one way or the other. Could we simply store a 2-byte offset value at the beginning of each page? Likely, but that will require a bit more work; and once we have memdesc we might want to move the offset in there (struct zsalloc?) again. ... but we can limit the abuse to 16 bit, glue it to a page type that must be set, and document it. page_has_type() will always successfully indicate such zsmalloc pages, and such zsmalloc pages only. We lose zsmalloc support for PAGE_SIZE > 64KB, which should be tolerable. 
We could use more bits from the page type, but 16 bit sounds like a good idea for now. So clarify the _mapcount/page_type documentation, use a proper page_type for zsmalloc, and remove page_mapcount_reset(). [1] https://lore.kernel.org/all/20231130101242.2590384-1-42.hyeyoo@gmail.com/ This patch (of 6): Let's make it clearer that _mapcount must no longer be used for own purposes, and how _mapcount and page_type behaves nowadays (also in the context of hugetlb folios, which are typed folios that will be mapped to user space). Move the documentation regarding "-1" over from page_mapcount_reset(), which we will remove next. Move "page_type" before "mapcount", to make it clearer what typed folios are. Link: https://lkml.kernel.org/r/20240529111904.2069608-1-david@redhat.com Link: https://lkml.kernel.org/r/20240529111904.2069608-2-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads] Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 ----- include/linux/mm_types.h | 28 +++++++++++++++++----------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 51fe207026f8..61a7f680f0ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,11 +1210,6 @@ static inline int folio_entire_mapcount(const struct folio *folio) return atomic_read(&folio->_entire_mapcount) + 1; } -/* - * The atomic page->_mapcount, starts from -1: so that transitions - * both from it and to it can be tracked, using atomic_inc_and_test - * and atomic_add_negative(-1). - */ static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index af3a0256fa93..278eade2ad4c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -46,9 +46,7 @@ struct mem_cgroup; * which is guaranteed to be aligned. If you use the same storage as * page->mapping, you must restore it to NULL before freeing the page. * - * If your page will not be mapped to userspace, you can also use the four - * bytes in the mapcount union, but you must call page_mapcount_reset() - * before freeing it. + * The mapcount field must not be used for own purposes. * * If you want to use the refcount field, it must be used in such a way * that other CPUs temporarily incrementing and then decrementing the @@ -152,18 +150,26 @@ struct page { union { /* This union is 4 bytes in size. */ /* - * If the page can be mapped to userspace, encodes the number - * of times this page is referenced by a page table. + * For head pages of typed folios, the value stored here + * allows for determining what this page is used for. The + * tail pages of typed folios will not store a type + * (page_type == _mapcount == -1). + * + * See page-flags.h for a list of page types which are currently + * stored here. */ - atomic_t _mapcount; + unsigned int page_type; /* - * If the page is neither PageSlab nor mappable to userspace, - * the value stored here may help determine what this page - * is used for. See page-flags.h for a list of page types - * which are currently stored here. + * For pages that are part of non-typed folios for which mappings + * are tracked via the RMAP, encodes the number of times this page + * is directly referenced by a page table. 
+ * + * Note that the mapcount is always initialized to -1, so that + * transitions both from it and to it can be tracked, using + * atomic_inc_and_test() and atomic_add_negative(-1). */ - unsigned int page_type; + atomic_t _mapcount; }; /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ -- cgit v1.2.3 From 8db00ad5646171880239b7f10e333278f63d8fcf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 29 May 2024 13:19:00 +0200 Subject: mm: allow reuse of the lower 16 bit of the page type with an actual type As long as the owner sets a page type first, we can allow reuse of the lower 16 bit: sufficient to store an offset into a 64 KiB page, which is the maximum base page size in *common* configurations (ignoring the 256 KiB variant). Restrict it to the head page. We'll use that for zsmalloc next, to set a proper type while still reusing that field to store information (offset into a base page) that cannot go elsewhere for now. Let's reserve the lower 16 bit for that purpose and for catching mapcount underflows, and let's reduce PAGE_TYPE_BASE to a single bit. Note that we will still have to overflow the mapcount quite a lot until we would actually indicate a valid page type. Start handing out the type bits from highest to lowest, to make it clearer how many bits for types we have left. Out of 15 bit we can use for types, we currently use 6. If we run out of bits before we have better typing (e.g., memdesc), we can always investigate storing a value instead [1]. [1] https://lore.kernel.org/all/00ba1dff-7c05-46e8-b0d9-a78ac1cfc198@redhat.com/ [akpm@linux-foundation.org: fix PG_hugetlb typo, per David] Link: https://lkml.kernel.org/r/20240529111904.2069608-3-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads] Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ include/linux/page-flags.h | 25 +++++++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 278eade2ad4c..ef09c4eef6d3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -157,6 +157,11 @@ struct page { * * See page-flags.h for a list of page types which are currently * stored here. + * + * Owners of typed folios may reuse the lower 16 bit of the + * head page page_type field after setting the page type, + * but must reset these 16 bit to -1 before clearing the + * page type. */ unsigned int page_type; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b041bc058fb1..d221a6a14764 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -941,16 +941,21 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) */ enum pagetype { - PG_buddy = 0x00000080, - PG_offline = 0x00000100, - PG_table = 0x00000200, - PG_guard = 0x00000400, - PG_hugetlb = 0x00000800, - PG_slab = 0x00001000, - - PAGE_TYPE_BASE = 0xf0000000, - /* Reserve 0x0000007f to catch underflows of _mapcount */ - PAGE_MAPCOUNT_RESERVE = -128, + PG_buddy = 0x40000000, + PG_offline = 0x20000000, + PG_table = 0x10000000, + PG_guard = 0x08000000, + PG_hugetlb = 0x04000000, + PG_slab = 0x02000000, + + PAGE_TYPE_BASE = 0x80000000, + + /* + * Reserve 0xffff0000 - 0xfffffffe to catch _mapcount underflows and + * allow owners that set a type to reuse the lower 16 bit for their own + * purposes. 
+ */ + PAGE_MAPCOUNT_RESERVE = ~0x0000ffff, }; #define PageType(page, flag) \ -- cgit v1.2.3 From 43d746dc49bb4c82034fce01a92fe67344d664cf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 29 May 2024 13:19:01 +0200 Subject: mm/zsmalloc: use a proper page type Let's clean it up: use a proper page type and store our data (offset into a page) in the lower 16 bit as documented. We won't be able to support 256 KiB base pages, which is acceptable. Teach Kconfig to handle that cleanly using a new CONFIG_HAVE_ZSMALLOC. Based on this, we should do a proper "struct zsdesc" conversion, as proposed in [1]. This removes the last _mapcount/page_type offender. [1] https://lore.kernel.org/all/20231130101242.2590384-1-42.hyeyoo@gmail.com/ Link: https://lkml.kernel.org/r/20240529111904.2069608-4-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads] Reviewed-by: Sergey Senozhatsky Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d221a6a14764..0f7c7320391e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -947,6 +947,7 @@ enum pagetype { PG_guard = 0x08000000, PG_hugetlb = 0x04000000, PG_slab = 0x02000000, + PG_zsmalloc = 0x01000000, PAGE_TYPE_BASE = 0x80000000, @@ -1071,6 +1072,8 @@ FOLIO_TYPE_OPS(hugetlb, hugetlb) FOLIO_TEST_FLAG_FALSE(hugetlb) #endif +PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. -- cgit v1.2.3 From 11d5401b011e3557894d824dac210f0b18cc3911 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 29 May 2024 13:19:04 +0200 Subject: mm/mm_init: initialize page->_mapcount directly in __init_single_page() Let's simply reinitialize the page->_mapcount directly. We can now get rid of page_mapcount_reset(). Link: https://lkml.kernel.org/r/20240529111904.2069608-7-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads] Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 61a7f680f0ba..4f75fee497f1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,11 +1210,6 @@ static inline int folio_entire_mapcount(const struct folio *folio) return atomic_read(&folio->_entire_mapcount) + 1; } -static inline void page_mapcount_reset(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); -} - static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); -- cgit v1.2.3 From 2669324b81e5f67d409d3ad988da77c7c2a09045 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Jun 2024 19:48:20 +0800 Subject: mm: remove page_maybe_dma_pinned() After the last user of page_maybe_dma_pinned() is converted to folio_maybe_dma_pinned(), remove page_maybe_dma_pinned() and update the document and comment. 
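Coming back to the zsmalloc page type above: with PG_zsmalloc set, the lower 16 bit of page_type are free to hold the offset of the first object in the page. A sketch of the accessors (illustrative names; the real ones live in mm/zsmalloc.c):

#define FIRST_OBJ_PAGE_TYPE_MASK	0xffff

static void set_first_obj_offset(struct page *page, unsigned int offset)
{
	/* 16 bit are enough for offsets into base pages of up to 64 KiB. */
	BUILD_BUG_ON(PAGE_SIZE > SZ_64K);
	VM_WARN_ON_ONCE(!PageZsmalloc(page));
	VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
	page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK;
	page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
}

static unsigned int get_first_obj_offset(struct page *page)
{
	VM_WARN_ON_ONCE(!PageZsmalloc(page));
	return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK;
}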
[wangkefeng.wang@huawei.com: fix pin_user_pages.rst underlining] Link: https://lkml.kernel.org/r/61b256c7-4989-44ec-83db-f34a1bd4be2d@huawei.com Link: https://lkml.kernel.org/r/20240604114822.2089819-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Helge Deller Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4f75fee497f1..cb4aa725252e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1918,8 +1918,8 @@ static inline struct folio *pfn_folio(unsigned long pfn) * * For more information, please see Documentation/core-api/pin_user_pages.rst. * - * Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. + * Return: True, if it is likely that the folio has been "dma-pinned". + * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { @@ -1938,11 +1938,6 @@ static inline bool folio_maybe_dma_pinned(struct folio *folio) GUP_PIN_COUNTING_BIAS; } -static inline bool page_maybe_dma_pinned(struct page *page) -{ - return folio_maybe_dma_pinned(page_folio(page)); -} - /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. -- cgit v1.2.3 From a929e0d10f3db1a53668f6b9845db27d7fb63759 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Jun 2024 19:48:22 +0800 Subject: mm: remove page_mkclean() There are no more users of page_mkclean(), remove it and update the document and comment. Link: https://lkml.kernel.org/r/20240604114822.2089819-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Helge Deller Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/rmap.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cb4aa725252e..101945baffc7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1577,7 +1577,7 @@ static inline void put_page(struct page *page) * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() - * provides safe operation for get_user_pages(), page_mkclean() and + * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 694d3d886245..980fa5d75d69 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -802,8 +802,4 @@ static inline int folio_mkclean(struct folio *folio) } #endif /* CONFIG_MMU */ -static inline int page_mkclean(struct page *page) -{ - return folio_mkclean(page_folio(page)); -} #endif /* _LINUX_RMAP_H */ -- cgit v1.2.3 From 503b158fc30f203a1854c87183ca3467c6466001 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 11:09:37 +0200 Subject: mm/memory_hotplug: initialize memmap of !ZONE_DEVICE with PageOffline() instead of PageReserved() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently initialize the memmap such that PG_reserved is set and the refcount of the page is 1. 
In virtio-mem code, we have to manually clear that PG_reserved flag to make memory offlining with partially hotplugged memory blocks possible: has_unmovable_pages() would otherwise bail out on such pages. We want to avoid PG_reserved where possible and move to typed pages instead. Further, we want to further enlighten memory offlining code about PG_offline: offline pages in an online memory section. One example is handling managed page count adjustments in a cleaner way during memory offlining. So let's initialize the pages with PG_offline instead of PG_reserved. generic_online_page()->__free_pages_core() will now clear that flag before handing that memory to the buddy. Note that the page refcount is still 1 and would forbid offlining of such memory except when special care is take during GOING_OFFLINE as currently only implemented by virtio-mem. With this change, we can now get non-PageReserved() pages in the XEN balloon list. From what I can tell, that can already happen via decrease_reservation(), so that should be fine. HV-balloon should not really observe a change: partial online memory blocks still cannot get surprise-offlined, because the refcount of these PageOffline() pages is 1. Update virtio-mem, HV-balloon and XEN-balloon code to be aware that hotplugged pages are now PageOffline() instead of PageReserved() before they are handed over to the buddy. We'll leave the ZONE_DEVICE case alone for now. Note that self-hosted vmemmap pages will no longer be marked as reserved. This matches ordinary vmemmap pages allocated from the buddy during memory hotplug. Now, really only vmemmap pages allocated from memblock during early boot will be marked reserved. Existing PageReserved() checks seem to be handling all relevant cases correctly even after this change. Link: https://lkml.kernel.org/r/20240607090939.89524-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oscar Salvador [generic memory-hotplug bits] Cc: Alexander Potapenko Cc: Dexuan Cui Cc: Dmitry Vyukov Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Juergen Gross Cc: "K. Y. Srinivasan" Cc: Marco Elver Cc: Michael S. Tsirkin Cc: Mike Rapoport (IBM) Cc: Oleksandr Tyshchenko Cc: Stefano Stabellini Cc: Wei Liu Cc: Xuan Zhuo Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0f7c7320391e..b23772b08cc1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -30,16 +30,11 @@ * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) - * - Pages not added to the page allocator when onlining a section because - * they were excluded via the online_page_callback() or because they are - * PG_hwpoison. * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). - * - Pages part of an offline section (struct pages of offline sections should - * not be trusted as they will be initialized when first onlined). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. 
PMEM, DAX, HMM) @@ -1020,6 +1015,10 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy) * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * + * When a memory block gets onlined, all pages are initialized with a + * refcount of 1 and PageOffline(). generic_online_page() will + * take care of clearing PageOffline(). + * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the @@ -1027,8 +1026,7 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy) * pages (now with a reference count of zero) are treated like free pages, * allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will - * require to re-set the pages PageOffline() and not giving them to the - * buddy via online_page_callback_t. + * require not giving them to the buddy via generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random -- cgit v1.2.3 From 50625744220c101705a989d7c57a6c16e945f3b1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 11:09:38 +0200 Subject: mm/memory_hotplug: skip adjust_managed_page_count() for PageOffline() pages when offlining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently have a hack for virtio-mem in place to handle memory offlining with PageOffline pages for which we already adjusted the managed page count. Let's enlighten memory offlining code so we can get rid of that hack, and document the situation. Link: https://lkml.kernel.org/r/20240607090939.89524-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Alexander Potapenko Cc: Dexuan Cui Cc: Dmitry Vyukov Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Juergen Gross Cc: "K. Y. Srinivasan" Cc: Marco Elver Cc: Michael S. 
Tsirkin Cc: Mike Rapoport (IBM) Cc: Oleksandr Tyshchenko Cc: Stefano Stabellini Cc: Wei Liu Cc: Xuan Zhuo Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 4 ++-- include/linux/page-flags.h | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7a9ff464608d..ebe876930e78 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -175,8 +175,8 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); -extern void __offline_isolated_pages(unsigned long start_pfn, - unsigned long end_pfn); +extern unsigned long __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b23772b08cc1..1d441908b726 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1023,11 +1023,15 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy) * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() - * pages (now with a reference count of zero) are treated like free pages, - * allowing the containing memory block to get offlined. A driver that + * pages (now with a reference count of zero) are treated like free (unmanaged) + * pages, allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require not giving them to the buddy via generic_online_page(). * + * Memory offlining code will not adjust the managed page count for any + * PageOffline() pages, treating them like they were never exposed to the + * buddy using generic_online_page(). + * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using -- cgit v1.2.3 From 15bde4abab734c687c1f81704886aba3a70c268e Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 11:11:35 +1200 Subject: mm: extend rmap flags arguments for folio_add_new_anon_rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: clarify folio_add_new_anon_rmap() and __folio_add_anon_rmap()", v2. This patchset is preparatory work for mTHP swapin. folio_add_new_anon_rmap() assumes that new anon rmaps are always exclusive. However, this assumption doesn’t hold true for cases like do_swap_page(), where a new anon might be added to the swapcache and is not necessarily exclusive. The patchset extends the rmap flags to allow folio_add_new_anon_rmap() to handle both exclusive and non-exclusive new anon folios. The do_swap_page() function is updated to use this extended API with rmap flags. Consequently, all new anon folios now consistently use folio_add_new_anon_rmap(). The special case for !folio_test_anon() in __folio_add_anon_rmap() can be safely removed. In conclusion, new anon folios always use folio_add_new_anon_rmap(), regardless of exclusivity. 
Old anon folios continue to use __folio_add_anon_rmap() via folio_add_anon_rmap_pmd() and folio_add_anon_rmap_ptes(). This patch (of 3): In the case of a swap-in, a new anonymous folio is not necessarily exclusive. This patch updates the rmap flags to allow a new anonymous folio to be treated as either exclusive or non-exclusive. To maintain the existing behavior, we always use EXCLUSIVE as the default setting. [akpm@linux-foundation.org: cleanup and constifications per David and akpm] [v-songbaohua@oppo.com: fix missing doc for flags of folio_add_new_anon_rmap()] Link: https://lkml.kernel.org/r/20240619210641.62542-1-21cnbao@gmail.com [v-songbaohua@oppo.com: enhance doc for extend rmap flags arguments for folio_add_new_anon_rmap] Link: https://lkml.kernel.org/r/20240622030256.43775-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-2-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 980fa5d75d69..0978c64f49d8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -244,7 +244,7 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ -- cgit v1.2.3 From 78fefd04c123493bbf28434768fa577b2153c79b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 18 Jun 2024 17:12:39 +0800 Subject: mm: memory: convert clear_huge_page() to folio_zero_user() Patch series "mm: improve clear and copy user folio", v2. Some folio conversions. An improvement is to move address alignment into the caller as it is only needed if we don't know which address will be accessed when clearing/copying user folios. This patch (of 4): Replace clear_huge_page() with folio_zero_user(), and take a folio instead of a page. Directly get number of pages by folio_nr_pages() to remove pages_per_huge_page argument, furthermore, move the address alignment from folio_zero_user() to the callers since the alignment is only needed when we don't know which address will be accessed. 
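For illustration, a minimal sketch of how a call site changes with this conversion (hypothetical caller; the real users are the huge page fault and hugetlb paths):

	static void zero_new_huge_folio(struct folio *folio, unsigned long addr_hint)
	{
		/* Old page-based call; the caller passed the page count explicitly: */
		/* clear_huge_page(&folio->page, addr_hint, folio_nr_pages(folio)); */

		/* New folio-based call; the page count comes from folio_nr_pages()
		 * internally, and the caller aligns addr_hint itself only when it
		 * does not know which address will actually be accessed: */
		folio_zero_user(folio, addr_hint);
	}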
Link: https://lkml.kernel.org/r/20240618091242.2140164-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240618091242.2140164-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 101945baffc7..e2140ea6ae98 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4067,9 +4067,7 @@ enum mf_action_page_type { }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -extern void clear_huge_page(struct page *page, - unsigned long addr_hint, - unsigned int pages_per_huge_page); +void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); -- cgit v1.2.3 From dc9e6f7053dd540db2c9fba30c31a07de5edab01 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 31 May 2024 14:56:16 +0200 Subject: mm: read page_type using READ_ONCE KCSAN complains about possible data races: while we check for a page_type -- for example for sanity checks -- we might concurrently modify the mapcount that overlays page_type. Let's use READ_ONCE to avoid load tearing (shouldn't make a difference) and to make KCSAN happy. Likely, we might also want to use WRITE_ONCE for the writer side of page_type, if KCSAN ever complains about that. But we'll not mess with that for now. Note: nothing should really be broken besides wrong KCSAN complaints. The sanity check that triggers this was added in commit 68f0320824fa ("mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]()"). Even before that, similar races were likely possible, ever since we added page_type in commit 6e292b9be7f4 ("mm: split page_type out from _mapcount").
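As a minimal illustration of the pattern (hypothetical helper, mirroring the PageType()/page_has_type() changes in the diff below): page_type overlays _mapcount, which another CPU may rewrite concurrently, so the check should read the word exactly once rather than letting the compiler issue multiple loads.

	static inline bool page_is_buddy_type(const struct page *page)
	{
		/* one non-torn load of the word that overlays _mapcount */
		unsigned int type = READ_ONCE(page->page_type);

		return (type & (PAGE_TYPE_BASE | PG_buddy)) == PAGE_TYPE_BASE;
	}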
Link: https://lkml.kernel.org/r/20240531125616.2850153-1-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202405281431.c46a3be9-lkp@intel.com Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1d441908b726..5769fe6e4950 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -955,9 +955,9 @@ enum pagetype { }; #define PageType(page, flag) \ - ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + ((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) #define folio_test_type(folio, flag) \ - ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) static inline int page_type_has_type(unsigned int page_type) { @@ -966,7 +966,7 @@ static inline int page_type_has_type(unsigned int page_type) static inline int page_has_type(const struct page *page) { - return page_type_has_type(page->page_type); + return page_type_has_type(READ_ONCE(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ -- cgit v1.2.3 From 6b1709d4b7fce27c6c79fc78c17c458f9848a4d8 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:53 +0200 Subject: kmsan: expose kmsan_get_metadata() Each s390 CPU has lowcore pages associated with it. Each CPU sees its own lowcore at virtual address 0 through a hardware mechanism called prefixing. Additionally, all lowcores are mapped to non-0 virtual addresses stored in the lowcore_ptr[] array. When lowcore is accessed through virtual address 0, one needs to resolve metadata for lowcore_ptr[raw_smp_processor_id()]. Expose kmsan_get_metadata() to make it possible to do this from the arch code. Link: https://lkml.kernel.org/r/20240621113706.315500-10-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e0c23a32cdf0..fe6c2212bdb1 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -230,6 +230,15 @@ void kmsan_handle_urb(const struct urb *urb, bool is_out); */ void kmsan_unpoison_entry_regs(const struct pt_regs *regs); +/** + * kmsan_get_metadata() - Return a pointer to KMSAN shadow or origins. + * @addr: kernel address. + * @is_origin: whether to return origins or shadow. + * + * Return NULL if metadata cannot be found. 
+ */ +void *kmsan_get_metadata(void *addr, bool is_origin); + #else static inline void kmsan_init_shadow(void) -- cgit v1.2.3 From ec3e837d8fd96c68599b2861dd412094b7bc335c Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:55 +0200 Subject: kmsan: allow disabling KMSAN checks for the current task Like for KASAN, it's useful to temporarily disable KMSAN checks around, e.g., redzone accesses. Introduce kmsan_disable_current() and kmsan_enable_current(), which are similar to their KASAN counterparts. Make them reentrant in order to handle memory allocations in interrupt context. Repurpose the allow_reporting field for this. Link: https://lkml.kernel.org/r/20240621113706.315500-12-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 24 ++++++++++++++++++++++++ include/linux/kmsan_types.h | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index fe6c2212bdb1..14b5ea6d3a43 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -239,6 +239,22 @@ void kmsan_unpoison_entry_regs(const struct pt_regs *regs); */ void *kmsan_get_metadata(void *addr, bool is_origin); +/** + * kmsan_enable_current(): Enable KMSAN for the current task. + * + * Each kmsan_enable_current() current call must be preceded by a + * kmsan_disable_current() call. These call pairs may be nested. + */ +void kmsan_enable_current(void); + +/** + * kmsan_disable_current(): Disable KMSAN for the current task. + * + * Each kmsan_disable_current() current call must be followed by a + * kmsan_enable_current() call. These call pairs may be nested. + */ +void kmsan_disable_current(void); + #else static inline void kmsan_init_shadow(void) @@ -338,6 +354,14 @@ static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) { } +static inline void kmsan_enable_current(void) +{ +} + +static inline void kmsan_disable_current(void) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h index 929287981afe..dfc59918b3c0 100644 --- a/include/linux/kmsan_types.h +++ b/include/linux/kmsan_types.h @@ -31,7 +31,7 @@ struct kmsan_context_state { struct kmsan_ctx { struct kmsan_context_state cstate; int kmsan_in_runtime; - bool allow_reporting; + unsigned int depth; }; #endif /* _LINUX_KMSAN_TYPES_H */ -- cgit v1.2.3 From 1fdb3c7006d9914e4b070f7eee98dfbdf743ee16 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:56 +0200 Subject: kmsan: introduce memset_no_sanitize_memory() Add a wrapper for memset() that prevents unpoisoning. This is useful for filling memory allocator redzones. 
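To make the intended usage of kmsan_disable_current()/kmsan_enable_current() and memset_no_sanitize_memory() concrete, a rough sketch with hypothetical allocator-style helpers (the names and the redzone pattern are made up for illustration):

	#define RZ_PATTERN 0xcc

	static void redzone_fill(void *rz, size_t len)
	{
		/* a plain memset() would unpoison the range; this wrapper keeps it
		 * "uninitialized" from KMSAN's point of view */
		memset_no_sanitize_memory(rz, RZ_PATTERN, len);
	}

	static bool redzone_intact(const unsigned char *rz, size_t len)
	{
		bool ok = true;
		size_t i;

		kmsan_disable_current();	/* reentrant; must pair with enable below */
		for (i = 0; i < len; i++)
			if (rz[i] != RZ_PATTERN)
				ok = false;
		kmsan_enable_current();

		return ok;
	}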
Link: https://lkml.kernel.org/r/20240621113706.315500-13-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 14b5ea6d3a43..7109644f4c19 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -255,6 +255,19 @@ void kmsan_enable_current(void); */ void kmsan_disable_current(void); +/** + * memset_no_sanitize_memory(): Fill memory without KMSAN instrumentation. + * @s: address of kernel memory to fill. + * @c: constant byte to fill the memory with. + * @n: number of bytes to fill. + * + * This is like memset(), but without KMSAN instrumentation. + */ +static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) +{ + return __memset(s, c, n); +} + #else static inline void kmsan_init_shadow(void) @@ -362,6 +375,11 @@ static inline void kmsan_disable_current(void) { } +static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) +{ + return memset(s, c, n); +} + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From e6553e2f79b2673b239c3dd47a88451119eb82d9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:35:00 +0200 Subject: kmsan: expose KMSAN_WARN_ON() KMSAN_WARN_ON() is required for implementing s390-specific KMSAN functions, but right now it's available only to the KMSAN internal functions. Expose it to subsystems through . Link: https://lkml.kernel.org/r/20240621113706.315500-17-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 7109644f4c19..2b1432cc16d5 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -268,6 +268,29 @@ static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) return __memset(s, c, n); } +extern bool kmsan_enabled; +extern int panic_on_kmsan; + +/* + * KMSAN performs a lot of consistency checks that are currently enabled by + * default. BUG_ON is normally discouraged in the kernel, unless used for + * debugging, but KMSAN itself is a debugging tool, so it makes little sense to + * recover if something goes wrong. + */ +#define KMSAN_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kmsan_enabled, false); \ + if (panic_on_kmsan) { \ + /* Can't call panic() here because */ \ + /* of uaccess checks. 
*/ \ + BUG(); \ + } \ + } \ + __cond; \ + }) + #else static inline void kmsan_init_shadow(void) @@ -380,6 +403,8 @@ static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) return memset(s, c, n); } +#define KMSAN_WARN_ON WARN_ON + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From ee86814b0562f18255b55c5e6a01a022895994cf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 20 Jun 2024 23:29:35 +0200 Subject: mm/migrate: move NUMA hinting fault folio isolation + checks under PTL Currently we always take a folio reference even if migration will not even be tried or isolation failed, requiring us to grab+drop an additional reference. Further, we end up calling folio_likely_mapped_shared() while the folio might have already been unmapped, because after we dropped the PTL, that can easily happen. We want to stop touching mapcounts and friends from such context, and only call folio_likely_mapped_shared() while the folio is still mapped: mapcount information is pretty much stale and unreliable otherwise. So let's move checks into numamigrate_isolate_folio(), rename that function to migrate_misplaced_folio_prepare(), and call that function from callsites where we call migrate_misplaced_folio(), but still with the PTL held. We can now stop taking temporary folio references, and really only take a reference if folio isolation succeeded. Doing the folio_likely_mapped_shared() + folio isolation under PT lock is now similar to how we handle MADV_PAGEOUT. While at it, combine the folio_is_file_lru() checks. [david@redhat.com: fix list_del() corruption] Link: https://lkml.kernel.org/r/8f85c31a-e603-4578-bf49-136dae0d4b69@redhat.com Link: https://lkml.kernel.org/r/20240626191129.658CFC32782@smtp.kernel.org Link: https://lkml.kernel.org/r/20240620212935.656243-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Zi Yan Tested-by: Donet Tom Signed-off-by: Andrew Morton --- include/linux/migrate.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 517f70b70620..af2579ae93f2 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -140,9 +140,16 @@ const struct movable_operations *page_movable_ops(struct page *page) } #ifdef CONFIG_NUMA_BALANCING +int migrate_misplaced_folio_prepare(struct folio *folio, + struct vm_area_struct *vma, int node); int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node); #else +static inline int migrate_misplaced_folio_prepare(struct folio *folio, + struct vm_area_struct *vma, int node) +{ + return -EAGAIN; /* can't migrate now */ +} static inline int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node) { -- cgit v1.2.3 From bb82ac31ddcedd6a1e6ee30b7afec7acdef811e1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Jun 2024 12:18:55 +0200 Subject: readahead: drop index argument of page_cache_async_readahead() The index argument of page_cache_async_readahead() is just folio->index so there's no point in passing is separately. Drop it. 
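For illustration, how an existing call site changes under this cleanup (sketch of a hypothetical filesystem readahead call, derived from the interface change in the diff below):

	/* before: the index was passed separately, even though it is always the
	 * folio's own index */
	page_cache_async_readahead(mapping, ra, file, folio, folio->index, nr);

	/* after: the helper derives the index from the folio itself */
	page_cache_async_readahead(mapping, ra, file, folio, nr);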
Link: https://lkml.kernel.org/r/20240625101909.12234-5-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Josef Bacik Tested-by: Zhang Peng Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e37e16ebff7a..eef99375dcf1 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1293,8 +1293,7 @@ void page_cache_sync_readahead(struct address_space *mapping, * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. - * @folio: The folio at @index which triggered the readahead call. - * @index: Index of first page to be read. + * @folio: The folio which triggered the readahead call. * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which @@ -1305,9 +1304,9 @@ void page_cache_sync_readahead(struct address_space *mapping, static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, - struct folio *folio, pgoff_t index, unsigned long req_count) + struct folio *folio, unsigned long req_count) { - DEFINE_READAHEAD(ractl, file, ra, mapping, index); + DEFINE_READAHEAD(ractl, file, ra, mapping, folio->index); page_cache_async_ra(&ractl, folio, req_count); } -- cgit v1.2.3 From 87024f5837485c2f9541283747428df54c0f9183 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 24 Jun 2024 17:58:55 -0700 Subject: mm: memcg: rename soft limit reclaim-related functions Rename exported function related to the softlimit reclaim to have memcg1_ prefix. Link: https://lkml.kernel.org/r/20240625005906.106920-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7403dd5926eb..83c8327455d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1121,9 +1121,9 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); #else /* CONFIG_MEMCG */ @@ -1572,9 +1572,9 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or } static inline -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) { return 0; } -- cgit v1.2.3 From 66d60c428b23c5b171218985b6880958fb7edbe3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 24 Jun 2024 17:58:58 -0700 Subject: mm: memcg: move legacy memcg event code into memcontrol-v1.c Cgroup v1's memory controller contains a pretty complicated event notifications mechanism which is not used on cgroup v2. 
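Call sites only need the new name; the arguments are unchanged (hypothetical caller variables, shown for illustration):

	/* was: mem_cgroup_soft_limit_reclaim(pgdat, order, gfp_mask, &nr_scanned); */
	nr_reclaimed = memcg1_soft_limit_reclaim(pgdat, order, gfp_mask, &nr_scanned);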
Let's move the corresponding code into memcontrol-v1.c. Please, note, that mem_cgroup_event_ratelimit() remains in memcontrol.c, otherwise it would require exporting too many details on memcg stats outside of memcontrol.c. Link: https://lkml.kernel.org/r/20240625005906.106920-7-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83c8327455d8..588179d29849 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -69,18 +69,6 @@ struct mem_cgroup_id { refcount_t ref; }; -/* - * Per memcg event counter is incremented at every pagein/pageout. With THP, - * it will be incremented by the number of pages. This counter is used - * to trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - */ -enum mem_cgroup_events_target { - MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_NTARGETS, -}; - struct memcg_vmstats_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; -- cgit v1.2.3 From 6f1173d6845974aeb654a7f070c0c9f21283e1b3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 24 Jun 2024 17:59:04 -0700 Subject: mm: memcg: group cgroup v1 memcg related declarations Group all cgroup v1-related declarations at the end of memcontrol.h and mm/memcontrol-v1.h with an intention to put them all together under a config option later on. It should make things easier to follow and maintain too. Link: https://lkml.kernel.org/r/20240625005906.106920-13-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 144 ++++++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 588179d29849..a70d64ed04f5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -950,39 +950,13 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } -static inline bool task_in_memcg_oom(struct task_struct *p) -{ - return p->memcg_in_oom; -} - -bool mem_cgroup_oom_synchronize(bool wait); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); -void folio_memcg_lock(struct folio *folio); -void folio_memcg_unlock(struct folio *folio); - void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); -/* try to stablize folio_memcg() for all the pages in a memcg */ -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - rcu_read_lock(); - - if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) - return true; - - rcu_read_unlock(); - return false; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) @@ -1109,10 +1083,6 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void 
split_page_memcg(struct page *head, int old_order, int new_order); -unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); - #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1423,26 +1393,6 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline void folio_memcg_lock(struct folio *folio) -{ -} - -static inline void folio_memcg_unlock(struct folio *folio) -{ -} - -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - /* to match folio_memcg_rcu() */ - rcu_read_lock(); - return true; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } @@ -1455,16 +1405,6 @@ static inline void mem_cgroup_exit_user_fault(void) { } -static inline bool task_in_memcg_oom(struct task_struct *p) -{ - return false; -} - -static inline bool mem_cgroup_oom_synchronize(bool wait) -{ - return false; -} - static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { @@ -1558,14 +1498,6 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } - -static inline -unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - return 0; -} #endif /* CONFIG_MEMCG */ /* @@ -1916,4 +1848,80 @@ static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) } #endif + +/* Cgroup v1-related declarations */ + +#ifdef CONFIG_MEMCG +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); + +bool mem_cgroup_oom_synchronize(bool wait); + +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return p->memcg_in_oom; +} + +void folio_memcg_lock(struct folio *folio); +void folio_memcg_unlock(struct folio *folio); + +/* try to stablize folio_memcg() for all the pages in a memcg */ +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + rcu_read_lock(); + + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) + return true; + + rcu_read_unlock(); + return false; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + +#else /* CONFIG_MEMCG */ +static inline +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + return 0; +} + +static inline void folio_memcg_lock(struct folio *folio) +{ +} + +static inline void folio_memcg_unlock(struct folio *folio) +{ +} + +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + /* to match folio_memcg_rcu() */ + rcu_read_lock(); + return true; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return false; +} + +static inline bool mem_cgroup_oom_synchronize(bool wait) +{ + return false; +} + +#endif /* CONFIG_MEMCG */ + #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From e93d4166b40a84df83c4d5cb4c709d022808ef9b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 24 Jun 2024 17:59:05 -0700 Subject: mm: memcg: put cgroup v1-specific code under a config option Put legacy cgroup v1 memory controller code under a new CONFIG_MEMCG_V1 config option. The option is turned off by default. 
Nobody except those who are still using cgroup v1 should turn it on. If the option is not set, memory controller can still be mounted under cgroup v1, but none of memcg-specific control files are present. Please note, that not all cgroup v1's memory controller code is guarded yet (but most of it), it's a subject for some follow-up work. Thanks to Michal Hocko for providing a better Kconfig option description. [roman.gushchin@linux.dev: better config option description provided by Michal] Link: https://lkml.kernel.org/r/ZnxXNtvqllc9CDoo@google.com Link: https://lkml.kernel.org/r/20240625005906.106920-14-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a70d64ed04f5..796cfa842346 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1851,7 +1851,7 @@ static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) /* Cgroup v1-related declarations */ -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); @@ -1883,7 +1883,7 @@ static inline void mem_cgroup_unlock_pages(void) rcu_read_unlock(); } -#else /* CONFIG_MEMCG */ +#else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1922,6 +1922,6 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -#endif /* CONFIG_MEMCG */ +#endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From 410abb20acaea9679fc1b2276d47e70fba331451 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:36 -0800 Subject: mm: add defines for min/max swappiness Patch series "Add swappiness argument to memory.reclaim", v6. This patch proposes augmenting the memory.reclaim interface with a swappiness= argument that overrides the swappiness value for that instance of proactive reclaim. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to effect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desireable to control swap-out for proactive reclaim separately from reactive reclaim * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desireable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. 
In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting, adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. * vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting. This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally. Previously, this exact interface addition was proposed by Yosry[3]. In response, Roman proposed instead an interface to specify precise file/anon/slab reclaim amounts[4]. More recently, Huan proposed this as well[5] and others similarly questioned if this was the proper interface. Previous proposals sought to use this to allow proactive reclaimers to effectively perform a custom reclaim algorithm by issuing proactive reclaim with different settings to control file vs anon reclaim (e.g. to only reclaim anon from some applications). Responses argued that adjusting swappiness is a poor interface for custom reclaim. In contrast, I argue in favor of a swappiness setting not as a way to implement custom reclaim algorithms but rather to bias the balance of anon vs file due to differences of proactive vs reactive reclaim. In this context, swappiness is the existing interface for controlling this balance and this patch simply allows for it to be configured differently for proactive vs reactive reclaim. Specifying explicit amounts of anon vs file pages to reclaim feels inappropriate for this purpose. Proactive reclaimers are unaware of the relative age of file vs anon for a cgroup which makes it difficult to manage proactive reclaim of different memory pools. A proactive reclaimer would need some amount of anon reclaim attempts separate from the amount of file reclaim attempts which seems brittle given that it's difficult to observe the impact. [1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 [3]https://lore.kernel.org/linux-mm/CAJD7tkbDpyoODveCsnaqBBMZEkDvshXJmNdbk51yKSNgD7aGdg@mail.gmail.com/ [4]https://lore.kernel.org/linux-mm/YoPHtHXzpK51F%2F1Z@carbon/ [5]https://lore.kernel.org/lkml/20231108065818.19932-1-link@vivo.com/ This patch (of 2): We use the constants 0 and 200 in a few places in the mm code when referring to the min and max swappiness. This patch adds MIN_SWAPPINESS and MAX_SWAPPINESS #defines to improve clarity. There are no functional changes.
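A small illustration of what the new constants buy in readability (hypothetical validation helper, not part of this patch):

	/* swappiness supplied by userspace must fall within
	 * [MIN_SWAPPINESS, MAX_SWAPPINESS], i.e. [0, 200], instead of being
	 * compared against bare magic numbers */
	static bool swappiness_valid(int swappiness)
	{
		return swappiness >= MIN_SWAPPINESS && swappiness <= MAX_SWAPPINESS;
	}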
Link: https://lkml.kernel.org/r/20240103164841.2800183-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20240103164841.2800183-2-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Acked-by: David Rientjes Acked-by: Chris Li Reviewed-by: Nhat Pham Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shakeel Butt Cc: Tejun Heo Cc: Yosry Ahmed Cc: Yue Zhao Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d33ce740b695..e97db0fa7659 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -404,6 +404,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) +#define MIN_SWAPPINESS 0 +#define MAX_SWAPPINESS 200 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, -- cgit v1.2.3 From 68cd9050d871e4db5433420b5ceb32f5512d18bc Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:37 -0800 Subject: mm: add swappiness= arg to memory.reclaim Allow proactive reclaimers to submit an additional swappiness= argument to memory.reclaim. This overrides the global or per-memcg swappiness setting for that reclaim attempt. For example: echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim will perform reclaim on the rootcg with a swappiness setting of 0 (no swap) regardless of the vm.swappiness sysctl setting. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to effect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desireable to control swap-out for proactive reclaim separately from reactive reclaim * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desireable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting, adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. 
* vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting. This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally. [1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 Link: https://lkml.kernel.org/r/20240103164841.2800183-3-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Suggested-by: Yosry Ahmed Acked-by: Michal Hocko Acked-by: David Rientjes Acked-by: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shakeel Butt Cc: Tejun Heo Cc: Yue Zhao Cc: Zefan Li Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e97db0fa7659..3df75d62a835 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -409,7 +409,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, -- cgit v1.2.3 From 773e9ae77fe777ac09739d8c32b32fff147f2d66 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:10 +0000 Subject: mm: memcg: factor out legacy socket memory accounting code Move out the legacy cgroup v1 socket memory accounting code into mm/memcontrol-v1.c. This commit introduces three new functions: memcg1_tcpmem_active(), memcg1_charge_skmem() and memcg1_uncharge_skmem(), which contain all cgroup v1-specific code and become trivial if CONFIG_MEMCG_V1 isn't set. Note, that !!memcg->tcpmem_pressure check in mem_cgroup_under_socket_pressure() can't be easily moved into memcontrol-v1.h without including memcontrol-v1.h from memcontrol.h which isn't a good idea, so it's better to just #ifdef it. 
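A rough sketch of the header-side shape this factoring implies (the exact signatures and location are assumptions for illustration; the real helpers live in the memcontrol-v1 files):

	#ifdef CONFIG_MEMCG_V1
	/* cgroup v1 logic stays behind these helpers in memcontrol-v1 code */
	bool memcg1_tcpmem_active(struct mem_cgroup *memcg);
	bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
				 gfp_t gfp_mask);
	void memcg1_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
	#else
	static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
	{
		return false;
	}
	static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg,
					       unsigned int nr_pages, gfp_t gfp_mask)
	{
		return false;
	}
	static inline void memcg1_uncharge_skmem(struct mem_cgroup *memcg,
						 unsigned int nr_pages)
	{
	}
	#endif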
Link: https://lkml.kernel.org/r/20240628210317.272856-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 796cfa842346..44ab6394c9ed 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1650,8 +1650,10 @@ void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { +#ifdef CONFIG_MEMCG_V1 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return !!memcg->tcpmem_pressure; +#endif /* CONFIG_MEMCG_V1 */ do { if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; -- cgit v1.2.3 From 94b7e5bf09b08aa4cce4625d0a4b307399b77e2c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:14 +0000 Subject: mm: memcg: put memcg1-specific struct mem_cgroup's members under CONFIG_MEMCG_V1 Put memcg1-specific members of struct mem_cgroup under the CONFIG_MEMCG_V1 config option. Also group them close to the end of struct mem_cgroup just before the dynamic per-node part. Link: https://lkml.kernel.org/r/20240628210317.272856-7-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 103 +++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 44ab6394c9ed..107b0c5d6eab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -188,10 +188,6 @@ struct mem_cgroup { struct page_counter memsw; /* v1 only */ }; - /* Legacy consumer-oriented counters */ - struct page_counter kmem; /* v1 only */ - struct page_counter tcpmem; /* v1 only */ - /* Range enforcement for interrupt charges */ struct work_struct high_work; @@ -205,8 +201,6 @@ struct mem_cgroup { bool zswap_writeback; #endif - unsigned long soft_limit; - /* vmpressure notifications */ struct vmpressure vmpressure; @@ -215,13 +209,7 @@ struct mem_cgroup { */ bool oom_group; - /* protected by memcg_oom_lock */ - bool oom_lock; - int under_oom; - - int swappiness; - /* OOM-Killer disable */ - int oom_kill_disable; + int swappiness; /* memory.events and memory.events.local */ struct cgroup_file events_file; @@ -230,27 +218,6 @@ struct mem_cgroup { /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; - /* protect arrays of thresholds */ - struct mutex thresholds_lock; - - /* thresholds for memory usage. RCU-protected */ - struct mem_cgroup_thresholds thresholds; - - /* thresholds for mem+swap usage. RCU-protected */ - struct mem_cgroup_thresholds memsw_thresholds; - - /* For oom notifier event fd */ - struct list_head oom_notify; - - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? 
- */ - unsigned long move_charge_at_immigrate; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; - unsigned long move_lock_flags; - CACHELINE_PADDING(_pad1_); /* memory.stat */ @@ -267,10 +234,6 @@ struct mem_cgroup { */ unsigned long socket_pressure; - /* Legacy tcp memory accounting */ - bool tcpmem_active; - int tcpmem_pressure; - #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; /* @@ -284,14 +247,6 @@ struct mem_cgroup { struct list_head objcg_list; #endif - CACHELINE_PADDING(_pad2_); - - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - struct task_struct *move_lock_task; - struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK @@ -300,10 +255,6 @@ struct mem_cgroup { struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif @@ -313,6 +264,58 @@ struct mem_cgroup { struct lru_gen_mm_list mm_list; #endif +#ifdef CONFIG_MEMCG_V1 + /* Legacy consumer-oriented counters */ + struct page_counter kmem; /* v1 only */ + struct page_counter tcpmem; /* v1 only */ + + unsigned long soft_limit; + + /* protected by memcg_oom_lock */ + bool oom_lock; + int under_oom; + + /* OOM-Killer disable */ + int oom_kill_disable; + + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_thresholds thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; + + /* + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + unsigned long move_lock_flags; + + /* Legacy tcp memory accounting */ + bool tcpmem_active; + int tcpmem_pressure; + + CACHELINE_PADDING(_pad2_); + + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + struct task_struct *move_lock_task; + + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; +#endif /* CONFIG_MEMCG_V1 */ + struct mem_cgroup_per_node *nodeinfo[]; }; -- cgit v1.2.3 From 98c9daf5ae6be008f78c07b744bcff7bcc6e98da Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:15 +0000 Subject: mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node Put memcg1-specific members of struct mem_cgroup_per_node under the CONFIG_MEMCG_V1 config option. 
Link: https://lkml.kernel.org/r/20240628210317.272856-8-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 107b0c5d6eab..c7ef628ee882 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -92,6 +92,7 @@ struct mem_cgroup_per_node { struct lruvec_stats *lruvec_stats; struct shrinker_info __rcu *shrinker_info; +#ifdef CONFIG_MEMCG_V1 /* * Memcg-v1 only stuff in middle as buffer between read mostly fields * and update often fields to avoid false sharing. Once v1 stuff is @@ -102,6 +103,7 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; +#endif /* Fields which get updated often at the end. */ struct lruvec lruvec; -- cgit v1.2.3 From 1c3a0b3d0b11fb98c40360f849563185218c2dd4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:16 +0000 Subject: mm: memcg: put struct task_struct::memcg_in_oom under CONFIG_MEMCG_V1 The memcg_in_oom field of the struct task_struct is not used by the cgroup v2's memory controller, so it can be happily compiled out if CONFIG_MEMCG_V1 is not set. Link: https://lkml.kernel.org/r/20240628210317.272856-9-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/sched.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..2a16023e8620 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1447,9 +1447,11 @@ struct task_struct { unsigned int kcov_softirq; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; +#endif +#ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; -- cgit v1.2.3 From 1419ff984aad4c08d121793f71a520b432ead88a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:17 +0000 Subject: mm: memcg: put struct task_struct::in_user_fault under CONFIG_MEMCG_V1 The struct task_struct's in_user_fault member is not used by the cgroup v2's memory controller, so it can be put under the CONFIG_MEMCG_V1 config option. To do so, mem_cgroup_enter_user_fault() and mem_cgroup_exit_user_fault() are moved under the CONFIG_MEMCG_V1 option as well. 
Link: https://lkml.kernel.org/r/20240628210317.272856-10-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 40 ++++++++++++++++++++-------------------- include/linux/sched.h | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c7ef628ee882..d0c9365ff039 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -943,18 +943,6 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); -static inline void mem_cgroup_enter_user_fault(void) -{ - WARN_ON(current->in_user_fault); - current->in_user_fault = 1; -} - -static inline void mem_cgroup_exit_user_fault(void) -{ - WARN_ON(!current->in_user_fault); - current->in_user_fault = 0; -} - struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); @@ -1402,14 +1390,6 @@ static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } -static inline void mem_cgroup_enter_user_fault(void) -{ -} - -static inline void mem_cgroup_exit_user_fault(void) -{ -} - static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { @@ -1890,6 +1870,18 @@ static inline void mem_cgroup_unlock_pages(void) rcu_read_unlock(); } +static inline void mem_cgroup_enter_user_fault(void) +{ + WARN_ON(current->in_user_fault); + current->in_user_fault = 1; +} + +static inline void mem_cgroup_exit_user_fault(void) +{ + WARN_ON(!current->in_user_fault); + current->in_user_fault = 0; +} + #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -1929,6 +1921,14 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline void mem_cgroup_enter_user_fault(void) +{ +} + +static inline void mem_cgroup_exit_user_fault(void) +{ +} + #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a16023e8620..a7770c566c4d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -934,7 +934,7 @@ struct task_struct { #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN -- cgit v1.2.3 From 28bdacbcb36d093e23734acccecd139f5fc05f67 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 26 Jun 2024 16:53:23 +0800 Subject: mm: move memory_failure_queue() into copy_mc_[user]_highpage() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: migrate: support poison recover from migrate folio", v5. The folio migration is widely used in kernel, memory compaction, memory hotplug, soft offline page, numa balance, memory demote/promotion, etc, but once access a poisoned source folio when migrating, the kernel will panic. There is a mechanism in the kernel to recover from uncorrectable memory errors, ARCH_HAS_COPY_MC(eg, Machine Check Safe Memory Copy on x86), which is already used in NVDIMM or core-mm paths(eg, CoW, khugepaged, coredump, ksm copy), see copy_mc_to_{user,kernel}, copy_mc_{user_}highpage callers. 
This series of patches provide the recovery mechanism from folio copy for the widely used folio migration. Please note, because folio migration is no guarantee of success, so we could chose to make folio migration tolerant of memory failures, adding folio_mc_copy() which is a #MC versions of folio_copy(), once accessing a poisoned source folio, we could return error and make the folio migration fail, and this could avoid the similar panic shown below. CPU: 1 PID: 88343 Comm: test_softofflin Kdump: loaded Not tainted 6.6.0 pc : copy_page+0x10/0xc0 lr : copy_highpage+0x38/0x50 ... Call trace: copy_page+0x10/0xc0 folio_copy+0x78/0x90 migrate_folio_extra+0x54/0xa0 move_to_new_folio+0xd8/0x1f0 migrate_folio_move+0xb8/0x300 migrate_pages_batch+0x528/0x788 migrate_pages_sync+0x8c/0x258 migrate_pages+0x440/0x528 soft_offline_in_use_page+0x2ec/0x3c0 soft_offline_page+0x238/0x310 soft_offline_page_store+0x6c/0xc0 dev_attr_store+0x20/0x40 sysfs_kf_write+0x4c/0x68 kernfs_fop_write_iter+0x130/0x1c8 new_sync_write+0xa4/0x138 vfs_write+0x238/0x2d8 ksys_write+0x74/0x110 This patch (of 5): There is a memory_failure_queue() call after copy_mc_[user]_highpage(), see callers, eg, CoW/KSM page copy, it is used to mark the source page as h/w poisoned and unmap it from other tasks, and the upcomming poison recover from migrate folio will do the similar thing, so let's move the memory_failure_queue() into the copy_mc_[user]_highpage() instead of adding it into each user, this should also enhance the handling of poisoned page in khugepaged. Link: https://lkml.kernel.org/r/20240626085328.608006-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240626085328.608006-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Reviewed-by: Miaohe Lin Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tony Luck Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/highmem.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fa6891e06316..930a591b9b61 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -352,6 +352,9 @@ static inline int copy_mc_user_highpage(struct page *to, struct page *from, kunmap_local(vto); kunmap_local(vfrom); + if (ret) + memory_failure_queue(page_to_pfn(from), 0); + return ret; } @@ -368,6 +371,9 @@ static inline int copy_mc_highpage(struct page *to, struct page *from) kunmap_local(vto); kunmap_local(vfrom); + if (ret) + memory_failure_queue(page_to_pfn(from), 0); + return ret; } #else -- cgit v1.2.3 From 02f4ee5a144cef6b26421cb42cca64bb4138d459 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 26 Jun 2024 16:53:24 +0800 Subject: mm: add folio_mc_copy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a #MC variant of folio_copy() which uses copy_mc_highpage() to support #MC handled during folio copy, it will be used in folio migration soon. 
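A minimal sketch of what such a helper can look like, assuming copy_mc_highpage() returns non-zero when a machine check is consumed while copying (the exact version is added later in this series):

    int folio_mc_copy(struct folio *dst, struct folio *src)
    {
        long nr = folio_nr_pages(src);
        long i = 0;

        for (;;) {
            /* abort the copy if the source page is poisoned */
            if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
                return -EHWPOISON;
            if (++i == nr)
                break;
            cond_resched();
        }

        return 0;
    }

Callers can then fail the migration cleanly instead of panicking inside folio_copy().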
Link: https://lkml.kernel.org/r/20240626085328.608006-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Reviewed-by: Miaohe Lin Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tony Luck Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e2140ea6ae98..a2e3ebead410 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1291,6 +1291,7 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); +int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -- cgit v1.2.3 From 3f59493713534b61abcbe2e57582cf7e31a42d51 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 26 Jun 2024 16:53:28 +0800 Subject: mm: migrate: remove folio_migrate_copy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The folio_migrate_copy() is just a wrapper of folio_copy() and folio_migrate_flags(), it is simple and only aio use it for now, unfold it and remove folio_migrate_copy(). Link: https://lkml.kernel.org/r/20240626085328.608006-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tony Luck Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index af2579ae93f2..644be30b69c8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -76,7 +76,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); -void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); -- cgit v1.2.3 From 9fa001cf3bb0598aad09d15b2896f7db9ef87637 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 1 Jul 2024 18:59:31 +0000 Subject: mm: memcg: drop obsolete cache line padding in struct mem_cgroup After the grouping of the cgroup v1-related fields and the corresponding reorganization of the struct mem_cgroup, the existing cache line padding doesn't make much sense anymore. Let's drop it for now and put back to new places, if necessary. 
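For readers unfamiliar with the idiom: CACHELINE_PADDING() inserts an empty, cache-line-aligned member, so the fields after it start on a fresh cache line and cannot false-share with the fields before it. A rough illustration (struct example is made up):

    struct example {
        /* read-mostly fields */
        unsigned long limit;

        CACHELINE_PADDING(_pad_);    /* next field starts on a new cache line */

        /* frequently updated fields */
        atomic_long_t usage;
    };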
Link: https://lkml.kernel.org/r/20240701185932.704807-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d0c9365ff039..8b5b3ddeba05 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -220,8 +220,6 @@ struct mem_cgroup { /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; - CACHELINE_PADDING(_pad1_); - /* memory.stat */ struct memcg_vmstats *vmstats; @@ -305,8 +303,6 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure; - CACHELINE_PADDING(_pad2_); - /* * set > 0 if pages under this cgroup are moving to other cgroup. */ -- cgit v1.2.3 From 6df13230b612af81ce04f20bb37a02e58ef71925 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 1 Jul 2024 18:59:32 +0000 Subject: mm: memcg: add cache line padding to mem_cgroup_per_node Memcg v1-specific fields serve a buffer function between read-mostly and update often parts of the mem_cgroup_per_node structure. If CONFIG_MEMCG_V1 is not set and these fields are not present, an explicit cacheline padding is needed. Link: https://lkml.kernel.org/r/20240701185932.704807-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8b5b3ddeba05..60418934827c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -95,14 +95,16 @@ struct mem_cgroup_per_node { #ifdef CONFIG_MEMCG_V1 /* * Memcg-v1 only stuff in middle as buffer between read mostly fields - * and update often fields to avoid false sharing. Once v1 stuff is - * moved in a separate struct, an explicit padding is needed. + * and update often fields to avoid false sharing. If v1 stuff is + * not present, an explicit padding is needed. */ struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; +#else + CACHELINE_PADDING(_pad1_); #endif /* Fields which get updated often at the end. */ -- cgit v1.2.3 From 3a3b7fec3974f954600844e41d773c00857ef48a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 1 Jul 2024 11:31:15 -0400 Subject: mm: remove CONFIG_MEMCG_KMEM CONFIG_MEMCG_KMEM used to be a user-visible option for whether slab tracking is enabled. It has been default-enabled and equivalent to CONFIG_MEMCG for almost a decade. We've only grown more kernel memory accounting sites since, and there is no imaginable cgroup usecase going forward that wants to track user pages but not the multitude of user-drivable kernel allocations. 
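As background, kernel-side allocations opt into memcg accounting either per call site or per slab cache; a hedged illustration (struct foo and the cache name are made up):

    /* charge one allocation to current's memory cgroup */
    ptr = kmalloc(sizeof(struct foo), GFP_KERNEL | __GFP_ACCOUNT);

    /* or mark a whole slab cache as accounted */
    cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
                              SLAB_ACCOUNT, NULL);

With kmem accounting folded into CONFIG_MEMCG, both paths are available whenever the memory controller is built in.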
Link: https://lkml.kernel.org/r/20240701153148.452230-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/bpf.h | 4 ++-- include/linux/list_lru.h | 2 +- include/linux/memcontrol.h | 22 +++++----------------- include/linux/sched.h | 3 +-- include/linux/slab.h | 12 ++++++------ 5 files changed, 15 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5e694a308081..b8637555c9c2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -275,7 +275,7 @@ struct bpf_map { u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; @@ -2252,7 +2252,7 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 792b67ceb631..5099a8ccd5f4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -50,7 +50,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct list_head list; int shrinker_id; bool memcg_aware; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60418934827c..7e2eb091049a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -195,7 +195,7 @@ struct mem_cgroup { /* Range enforcement for interrupt charges */ struct work_struct high_work; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP unsigned long zswap_max; /* @@ -236,7 +236,6 @@ struct mem_cgroup { */ unsigned long socket_pressure; -#ifdef CONFIG_MEMCG_KMEM int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting @@ -247,7 +246,6 @@ struct mem_cgroup { struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; -#endif struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -532,7 +530,6 @@ retry: return memcg; } -#ifdef CONFIG_MEMCG_KMEM /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. @@ -548,15 +545,6 @@ static inline bool folio_memcg_kmem(struct folio *folio) return folio->memcg_data & MEMCG_DATA_KMEM; } - -#else -static inline bool folio_memcg_kmem(struct folio *folio) -{ - return false; -} - -#endif - static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); @@ -1488,7 +1476,7 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or * if MEMCG_DATA_OBJEXTS is set. 
*/ struct slabobj_ext { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -1663,7 +1651,7 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, } #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); @@ -1806,9 +1794,9 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); diff --git a/include/linux/sched.h b/include/linux/sched.h index a7770c566c4d..82da65131a6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1457,9 +1457,8 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; -#endif -#ifdef CONFIG_MEMCG_KMEM + /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 7247e217e21b..a332dd2fa6cd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -41,7 +41,7 @@ enum _slab_flag_bits { #ifdef CONFIG_FAILSLAB _SLAB_FAILSLAB, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG _SLAB_ACCOUNT, #endif #ifdef CONFIG_KASAN_GENERIC @@ -171,7 +171,7 @@ enum _slab_flag_bits { # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif /* Account to memcg */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else # define SLAB_ACCOUNT __SLAB_FLAG_UNUSED @@ -407,7 +407,7 @@ enum kmalloc_cache_type { #ifndef CONFIG_ZONE_DMA KMALLOC_DMA = KMALLOC_NORMAL, #endif -#ifndef CONFIG_MEMCG_KMEM +#ifndef CONFIG_MEMCG KMALLOC_CGROUP = KMALLOC_NORMAL, #endif KMALLOC_RANDOM_START = KMALLOC_NORMAL, @@ -420,7 +420,7 @@ enum kmalloc_cache_type { #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES @@ -435,7 +435,7 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; #define KMALLOC_NOT_NORMAL_BITS \ (__GFP_RECLAIMABLE | \ (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ - (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) + (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0)) extern unsigned long random_kmalloc_seed; @@ -463,7 +463,7 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne */ if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) return KMALLOC_DMA; - if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) + if (!IS_ENABLED(CONFIG_MEMCG) || (flags & __GFP_RECLAIMABLE)) return KMALLOC_RECLAIM; else return KMALLOC_CGROUP; -- cgit v1.2.3 From 259043e3b730e0aa6408bff27af7edf7a5c9101c Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 30 Jun 2024 11:22:31 +1200 Subject: mm: zswap: fix zswap_never_enabled() for CONFIG_ZSWAP==N If CONFIG_ZSWAP is set to N, it means zswap cannot be enabled. zswap_never_enabled() should return true. The only effect of this issue is that with Barry's latest large folio swapin patches for zram ("mm: support mTHP swap-in for zRAM-like swapfile"), we will always fallback to order-0 swapin, even mistakenly when !CONFIG_ZSWAP. 
Basically this bug makes Barry's in progress patches not work at all. The API was created to inform the mm core that zswap has never been enabled, allowing the mm core to perform mTHP swap-in. This is a transitional solution until zswap supports mTHP. If zswap has been enabled, performing mTHP swap-in will result in corrupted data. You may find the answer in the mTHP swap-in series: https://lore.kernel.org/linux-mm/CAJD7tkZ4FQr6HZpduOdvmqgg_-whuZYE-Bz5O2t6yzw6Yg+v1A@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240629232231.42394-1-21cnbao@gmail.com Fixes: 0300e17d67c3 ("mm: zswap: add zswap_never_enabled()") Signed-off-by: Barry Song Reviewed-by: Chengming Zhou Acked-by: Yosry Ahmed Acked-by: Chris Li Acked-by: David Hildenbrand Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index bf83ae5e285d..6cecb4a4f68b 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -68,7 +68,7 @@ static inline bool zswap_is_enabled(void) static inline bool zswap_never_enabled(void) { - return false; + return true; } #endif -- cgit v1.2.3 From 6cc040542ba7b2c60e5119cd04d841fcf048c872 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Sun, 23 Jun 2024 23:36:09 -0700 Subject: mm/gup: introduce unpin_folio/unpin_folios helpers Patch series "mm/gup: Introduce memfd_pin_folios() for pinning memfd folios", v16. Currently, some drivers (e.g, Udmabuf) that want to longterm-pin the pages/folios associated with a memfd, do so by simply taking a reference on them. This is not desirable because the pages/folios may reside in Movable zone or CMA block. Therefore, having drivers use memfd_pin_folios() API ensures that the folios are appropriately pinned via FOLL_PIN for longterm DMA. This patchset also introduces a few helpers and converts the Udmabuf driver to use folios and memfd_pin_folios() API to longterm-pin the folios for DMA. Two new Udmabuf selftests are also included to test the driver and the new API. This patch (of 9): These helpers are the folio versions of unpin_user_page/unpin_user_pages. They are currently only useful for unpinning folios pinned by memfd_pin_folios() or other associated routines. However, they could find new uses in the future, when more and more folio-only helpers are added to GUP. We should probably sanity check the folio as part of unpin similar to how it is done in unpin_user_page/unpin_user_pages but we cannot cleanly do that at the moment without also checking the subpage. Therefore, sanity checking needs to be added to these routines once we have a way to determine if any given folio is anon-exclusive (via a per folio AnonExclusive flag). 
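As a usage sketch, a driver that tracks its pinned memory as an array of folios can now release it without reaching back to the head pages (the function name is hypothetical):

    static void release_pinned_folios(struct folio **folios, unsigned long nr)
    {
        /* drop the FOLL_PIN references taken when the folios were pinned */
        unpin_folios(folios, nr);
    }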
Link: https://lkml.kernel.org/r/20240624063952.1572359-1-vivek.kasireddy@intel.com Link: https://lkml.kernel.org/r/20240624063952.1572359-2-vivek.kasireddy@intel.com Signed-off-by: Vivek Kasireddy Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Acked-by: Dave Airlie Acked-by: Gerd Hoffmann Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Peter Xu Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dongwon Kim Cc: Hugh Dickins Cc: Junxiao Chang Cc: Oscar Salvador Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a2e3ebead410..7b84379e3a1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1584,11 +1584,13 @@ static inline void put_page(struct page *page) #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); +void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { -- cgit v1.2.3 From 89c1905d9c140372b7f50ef48f42378cf85d9bc5 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Sun, 23 Jun 2024 23:36:11 -0700 Subject: mm/gup: introduce memfd_pin_folios() for pinning memfd folios For drivers that would like to longterm-pin the folios associated with a memfd, the memfd_pin_folios() API provides an option to not only pin the folios via FOLL_PIN but also to check and migrate them if they reside in movable zone or CMA block. This API currently works with memfds but it should work with any files that belong to either shmemfs or hugetlbfs. Files belonging to other filesystems are rejected for now. The folios need to be located first before pinning them via FOLL_PIN. If they are found in the page cache, they can be immediately pinned. Otherwise, they need to be allocated using the filesystem specific APIs and then pinned. 
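An illustrative, heavily trimmed driver-side sequence using the new API; memfd_file, start, end and NR_FOLIOS are hypothetical names standing in for whatever the driver already has:

    struct folio *folios[NR_FOLIOS];
    pgoff_t start_offset;
    long nr;

    /* assumed to return the number of folios pinned or a negative errno,
     * in the style of the other pin_user_pages*() APIs */
    nr = memfd_pin_folios(memfd_file, start, end, folios,
                          NR_FOLIOS, &start_offset);
    if (nr < 0)
        return nr;    /* e.g. an unsupported file type */

    /* ... set up long-term DMA to the pinned folios ... */

    unpin_folios(folios, nr);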
[akpm@linux-foundation.org: improve the CONFIG_MMU=n situation, per SeongJae] [vivek.kasireddy@intel.com: return -EINVAL if the end offset is greater than the size of memfd] Link: https://lkml.kernel.org/r/IA0PR11MB71850525CBC7D541CAB45DF1F8DB2@IA0PR11MB7185.namprd11.prod.outlook.com Link: https://lkml.kernel.org/r/20240624063952.1572359-4-vivek.kasireddy@intel.com Signed-off-by: Vivek Kasireddy Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe (v2) Reviewed-by: David Hildenbrand (v3) Reviewed-by: Christoph Hellwig (v6) Acked-by: Dave Airlie Acked-by: Gerd Hoffmann Cc: Matthew Wilcox (Oracle) Cc: Daniel Vetter Cc: Hugh Dickins Cc: Peter Xu Cc: Dongwon Kim Cc: Junxiao Chang Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Mike Kravetz Cc: Oscar Salvador Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/memfd.h | 5 +++++ include/linux/mm.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index e7abf6fa4c52..3f2cf339ceaf 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -6,11 +6,16 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); +struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { return -EINVAL; } +static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) +{ + return ERR_PTR(-EINVAL); +} #endif #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b84379e3a1b..5f1075d19600 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2500,6 +2500,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, + struct folio **folios, unsigned int max_folios, + pgoff_t *offset); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); -- cgit v1.2.3 From f216c845f3c772e54d27fe209fd300b10e7bf54a Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 28 Jun 2024 21:07:49 +0800 Subject: mm: add per-order mTHP split counters Patch series "mm: introduce per-order mTHP split counters", v3. At present, the split counters in THP statistics no longer include PTE-mapped mTHP. Therefore, we want to introduce per-order mTHP split counters to monitor the frequency of mTHP splits. This will assist developers in better analyzing and optimizing system performance. /sys/kernel/mm/transparent_hugepage/hugepages-/stats split split_failed split_deferred This patch (of 2): Currently, the split counters in THP statistics no longer include PTE-mapped mTHP. Therefore, we propose introducing per-order mTHP split counters to monitor the frequency of mTHP splits. This will help developers better analyze and optimize system performance. 
/sys/kernel/mm/transparent_hugepage/hugepages-/stats split split_failed split_deferred [ioworker0@gmail.com: make things more readable, per Barry and Baolin] Link: https://lkml.kernel.org/r/20240704012905.42971-2-ioworker0@gmail.com [ioworker0@gmail.com: use == for `order' test, per David] Link: https://lkml.kernel.org/r/20240705113119.82210-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240704012905.42971-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240704012905.42971-2-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-2-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Reviewed-by: Ryan Roberts Acked-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Bang Li Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 212cca384d7e..cee3c5da8f0e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -284,6 +284,9 @@ enum mthp_stat_item { MTHP_STAT_FILE_ALLOC, MTHP_STAT_FILE_FALLBACK, MTHP_STAT_FILE_FALLBACK_CHARGE, + MTHP_STAT_SPLIT, + MTHP_STAT_SPLIT_FAILED, + MTHP_STAT_SPLIT_DEFERRED, __MTHP_STAT_COUNT }; -- cgit v1.2.3 From 18d095b2556e5e1292003c8e9f5d845ed42ef89b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Jul 2024 15:51:19 +0200 Subject: mm: define __pte_leaf_size() to also take a PMD entry On powerpc 8xx, when a page is 8M size, the information is in the PMD entry. So allow architectures to provide __pte_leaf_size() instead of pte_leaf_size() and provide the PMD entry to that function. When __pte_leaf_size() is not defined, define it as a pte_leaf_size() so that architectures not interested in the PMD arguments are not impacted. Only define a default pte_leaf_size() when __pte_leaf_size() is not defined to make sure nobody adds new calls to pte_leaf_size() in the core. Link: https://lkml.kernel.org/r/c7c008f0a314bf8029ad7288fdc908db1ec7e449.1719928057.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Oscar Salvador Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2f32eaccf0b9..2a6a3cccfc36 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1907,9 +1907,12 @@ typedef unsigned int pgtbl_mod_mask; #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif +#ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif +#define __pte_leaf_size(x,y) pte_leaf_size(y) +#endif /* * We always define pmd_pfn for all archs as it's used in lots of generic -- cgit v1.2.3 From e6c0c03245b14d6c205814aee67128257d0bea84 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Jul 2024 15:51:20 +0200 Subject: mm: provide mm_struct and address to huge_ptep_get() On powerpc 8xx huge_ptep_get() will need to know whether the given ptep is a PTE entry or a PMD entry. This cannot be known with the PMD entry itself because there is no easy way to know it from the content of the entry. So huge_ptep_get() will need to know either the size of the page or get the pmd. In order to be consistent with huge_ptep_get_and_clear(), give mm and address to huge_ptep_get(). 
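The conversion at call sites is mechanical; a small before/after sketch, where vma, addr and ptep stand for whatever the caller already has in scope:

    /* before */
    pte = huge_ptep_get(ptep);

    /* after: the mm and the address being walked are passed as well */
    pte = huge_ptep_get(vma->vm_mm, addr, ptep);

Generic implementations can ignore the extra arguments; only architectures that actually need the extra context, like powerpc 8xx, will use them.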
Link: https://lkml.kernel.org/r/cc00c70dd384298796a4e1b25d6c4eb306d3af85.1719928057.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Oscar Salvador Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swapops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a5c560a2f8c2..cb468e418ea1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -334,7 +334,7 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -359,7 +359,7 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *pte) { } + unsigned long addr, pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; -- cgit v1.2.3 From 8268614b408be6b76c1cee6f67de7deb0d6593b3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Jul 2024 15:51:35 +0200 Subject: mm: remove CONFIG_ARCH_HAS_HUGEPD powerpc was the only user of CONFIG_ARCH_HAS_HUGEPD and doesn't use it anymore, so remove all related code. Link: https://lkml.kernel.org/r/4b10c54c794780b955f3ad6c657d0199dd792146.1719928057.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Acked-by: Oscar Salvador Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a951c0d06061..ecd0ceeea206 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,12 +20,6 @@ struct user_struct; struct mmu_gather; struct node; -#ifndef CONFIG_ARCH_HAS_HUGEPD -typedef struct { unsigned long pd; } hugepd_t; -#define is_hugepd(hugepd) (0) -#define __hugepd(x) ((hugepd_t) { (x) }) -#endif - void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From 3b0ba54d5f8ff60553c01d3ec3c607ab7bb3b452 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 3 Jul 2024 10:42:25 -0700 Subject: mm: add comments for allocation helpers explaining why they are macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A number of allocation helper functions were converted into macros to account them at the call sites. Add a comment for each converted allocation helper explaining why it has to be a macro and why we typecast the return value wherever required. The patch also moves acpi_os_acquire_object() closer to other allocation helpers to group them together under the same comment. The patch has no functional changes. 
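The underlying reason, in sketch form: with memory allocation profiling each kmalloc() expansion carries its own alloc_tag for the enclosing call site, so an inline wrapper would fold every caller into a single tag, while a macro keeps them separate (struct foo is made up):

    /* inline helper: all users are accounted to this one line */
    static inline struct foo *foo_alloc(gfp_t gfp)
    {
        return kmalloc(sizeof(struct foo), gfp);
    }

    /* macro helper: kmalloc() expands in each caller, so every call
     * site gets its own entry in /proc/allocinfo */
    #define foo_alloc(gfp) \
        ((struct foo *)kmalloc(sizeof(struct foo), gfp))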
Link: https://lkml.kernel.org/r/20240703174225.3891393-1-surenb@google.com Fixes: 2c321f3f70bc ("mm: change inlined allocation helpers to account at the call site") Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Cc: Christian König Cc: Christoph Hellwig Cc: Jan Kara Cc: Kent Overstreet Cc: Thorsten Blum Signed-off-by: Andrew Morton --- include/linux/bpf.h | 4 ++++ include/linux/dma-fence-chain.h | 4 ++++ include/linux/hid_bpf.h | 5 +++++ include/linux/jbd2.h | 10 ++++++++++ include/linux/skbuff.h | 8 ++++++++ include/linux/skmsg.h | 5 +++++ 6 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b8637555c9c2..50b82ff4551a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2261,6 +2261,10 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else +/* + * These specialized allocators have to be macros for their allocations to be + * accounted separately (to have separate alloc_tag). + */ #define bpf_map_kmalloc_node(_map, _size, _flags, _node) \ kmalloc_node(_size, _flags, _node) #define bpf_map_kzalloc(_map, _size, _flags) \ diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index ad9e2506c2f4..68c3c1e41014 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -85,6 +85,10 @@ dma_fence_chain_contained(struct dma_fence *fence) * dma_fence_chain_alloc * * Returns a new struct dma_fence_chain object or NULL on failure. + * + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. */ #define dma_fence_chain_alloc() \ ((struct dma_fence_chain *)kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL)) diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index eec2592dec12..99a3edb6cf07 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -152,6 +152,11 @@ static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} static inline void hid_bpf_device_init(struct hid_device *hid) {} +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size) \ ((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL)) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ab04c1c27fae..a280f42c8c76 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1588,6 +1588,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh); */ extern struct kmem_cache *jbd2_handle_cache; +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define jbd2_alloc_handle(_gfp_flags) \ ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) @@ -1602,6 +1607,11 @@ static inline void jbd2_free_handle(handle_t *handle) */ extern struct kmem_cache *jbd2_inode_cache; +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). 
The typecast is + * intentional to enforce typesafety. + */ #define jbd2_alloc_inode(_gfp_flags) \ ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1c2902eaebd3..fd51f2de51da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3400,6 +3400,10 @@ static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask, } #define __dev_alloc_pages(...) alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__)) +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). + */ #define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order) /** @@ -3416,6 +3420,10 @@ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) } #define __dev_alloc_page(...) alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__)) +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). + */ #define dev_alloc_page() dev_alloc_pages(0) /** diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c9efda9df285..d9b03e0746e7 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -414,6 +414,11 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define sk_psock_init_link() \ ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ GFP_ATOMIC | __GFP_NOWARN)) -- cgit v1.2.3 From a8585ac68621983587f1701b7567978fcbcd9573 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 3 Jul 2024 13:25:10 +0200 Subject: mm/page_counter: move calculating protection values to page_counter It's a lot of math, and there is nothing memcontrol specific about it. This makes it easier to use inside of the drm cgroup controller. [akpm@linux-foundation.org: fix kerneldoc, per Jeff Johnson] Link: https://lkml.kernel.org/r/20240703112510.36424-1-maarten.lankhorst@linux.intel.com Signed-off-by: Maarten Lankhorst Acked-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Jeff Johnson Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 8cd858d912c4..904c52f97284 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -81,4 +81,8 @@ static inline void page_counter_reset_watermark(struct page_counter *counter) counter->watermark = page_counter_read(counter); } +void page_counter_calculate_protection(struct page_counter *root, + struct page_counter *counter, + bool recursive_protection); + #endif /* _LINUX_PAGE_COUNTER_H */ -- cgit v1.2.3 From 823430c8e9d98c5865af518c782d0493b76aa511 Mon Sep 17 00:00:00 2001 From: "Ho-Ren (Jack) Chuang" Date: Thu, 4 Jul 2024 07:26:44 +0000 Subject: memory tier: consolidate the initialization of memory tiers The current memory tier initialization process is distributed across two different functions, memory_tier_init() and memory_tier_late_init(). This design is hard to maintain. Thus, this patch is proposed to reduce the possible code paths by consolidating different initialization patches into one. 
The earlier discussion with Jonathan and Ying is listed here: https://lore.kernel.org/lkml/20240405150244.00004b49@Huawei.com/ If we want to put these two initializations together, they must be placed together in the later function. Because only at that time, the HMAT information will be ready, adist between nodes can be calculated, and memory tiering can be established based on the adist. So we position the initialization at memory_tier_init() to the memory_tier_late_init() call. Moreover, it's natural to keep memory_tier initialization in drivers at device_initcall() level. If we simply move the set_node_memory_tier() from memory_tier_init() to late_initcall(), it will result in HMAT not registering the mt_adistance_algorithm callback function, because set_node_memory_tier() is not performed during the memory tiering initialization phase, leading to a lack of correct default_dram information. Therefore, we introduced a nodemask to pass the information of the default DRAM nodes. The reason for not choosing to reuse default_dram_type->nodes is that it is not clean enough. So in the end, we use a __initdata variable, which is a variable that is released once initialization is complete, including both CPU and memory nodes for HMAT to iterate through. Link: https://lkml.kernel.org/r/20240704072646.437579-1-horen.chuang@linux.dev Signed-off-by: Ho-Ren (Jack) Chuang Suggested-by: Jonathan Cameron Reviewed-by: "Huang, Ying" Reviewed-by: Jonathan Cameron Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Dan Williams Cc: Dave Jiang Cc: Gregory Price Cc: Len Brown Cc: Michal Hocko Cc: Rafael J. Wysocki Cc: Ravi Jonnalagadda Cc: SeongJae Park Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 0d70788558f4..0dc0cf2863e2 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -38,6 +38,7 @@ struct access_coordinate; #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; extern struct memory_dev_type *default_dram_type; +extern nodemask_t default_dram_nodes; struct memory_dev_type *alloc_memory_type(int adistance); void put_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); @@ -76,6 +77,7 @@ static inline bool node_is_toptier(int node) #define numa_demotion_enabled false #define default_dram_type NULL +#define default_dram_nodes NODE_MASK_NONE /* * CONFIG_NUMA implementation returns non NULL error. */ -- cgit v1.2.3 From 00f58104202c472e487f0866fbd38832523fd4f9 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 4 Jul 2024 10:10:50 +0100 Subject: mm: fix khugepaged activation policy Since the introduction of mTHP, the docuementation has stated that khugepaged would be enabled when any mTHP size is enabled, and disabled when all mTHP sizes are disabled. There are 2 problems with this; 1. this is not what was implemented by the code and 2. this is not the desirable behavior. Desirable behavior is for khugepaged to be enabled when any PMD-sized THP is enabled, anon or file. (Note that file THP is still controlled by the top-level control so we must always consider that, as well as the PMD-size mTHP control for anon). khugepaged only supports collapsing to PMD-sized THP so there is no value in enabling it when PMD-sized THP is disabled. So let's change the code and documentation to reflect this policy. 
Further, per-size enabled control modification events were not previously forwarded to khugepaged to give it an opportunity to start or stop. Consequently the following was resulting in khugepaged eroneously not being activated: echo never > /sys/kernel/mm/transparent_hugepage/enabled echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled [ryan.roberts@arm.com: v3] Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240704091051.2411934-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface") Closes: https://lore.kernel.org/linux-mm/7a0bbe69-1e3d-4263-b206-da007791a5c4@redhat.com/ Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Barry Song Cc: Jonathan Corbet Cc: Lance Yang Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cee3c5da8f0e..acb6ac24a07e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -128,18 +128,6 @@ static inline bool hugepage_global_always(void) (1< Date: Fri, 5 Jul 2024 11:23:09 +0800 Subject: mm: thp: support "THPeligible" semantics for mTHP with anonymous shmem After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for anonymous shmem"), we can configure different policies through the multi-size THP sysfs interface for anonymous shmem. But currently "THPeligible" indicates only whether the mapping is eligible for allocating THP-pages as well as the THP is PMD mappable or not for anonymous shmem, we need to support semantics for mTHP with anonymous shmem similar to those for mTHP with anonymous memory. Link: https://lkml.kernel.org/r/20240705032309.24933-1-libang.li@antgroup.com Signed-off-by: Bang Li Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 3fb18f7eb73e..1d06b1e5408a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,12 +113,21 @@ int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge); #else static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags) { return false; } +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + return 0; +} #endif #ifdef CONFIG_SHMEM -- cgit v1.2.3 From 63d9866ab01ffd0d0835d5564107283a4afc0a38 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 10 Jul 2024 10:55:01 +0100 Subject: mm: shmem: rename mTHP shmem counters The legacy PMD-sized THP counters at /proc/vmstat include thp_file_alloc, thp_file_fallback and thp_file_fallback_charge, which rather confusingly refer to shmem THP and do not include any other types of file pages. 
This is inconsistent since in most other places in the kernel, THP counters are explicitly separated for anon, shmem and file flavours. However, we are stuck with it since it constitutes a user ABI. Recently, commit 66f44583f9b6 ("mm: shmem: add mTHP counters for anonymous shmem") added equivalent mTHP stats for shmem, keeping the same "file_" prefix in the names. But in future, we may want to add extra stats to cover actual file pages, at which point, it would all become very confusing. So let's take the opportunity to rename these new counters "shmem_" before the change makes it upstream and the ABI becomes immutable. While we are at it, let's improve the documentation for the legacy counters to make it clear that they count shmem pages only. Link: https://lkml.kernel.org/r/20240710095503.3193901-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Baolin Wang Reviewed-by: Lance Yang Reviewed-by: Zi Yan Reviewed-by: Barry Song Acked-by: David Hildenbrand Cc: Daniel Gomez Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index acb6ac24a07e..cff002be83eb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -269,9 +269,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_FILE_ALLOC, - MTHP_STAT_FILE_FALLBACK, - MTHP_STAT_FILE_FALLBACK_CHARGE, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, -- cgit v1.2.3 From a7526fe8b94eced7d82aa00b2bcca44e39ae0769 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2024 18:35:30 +0200 Subject: mm, slab: put should_failslab() back behind CONFIG_SHOULD_FAILSLAB Patch series "revert unconditional slab and page allocator fault injection calls". These two patches largely revert commits that added function call overhead into slab and page allocation hotpaths and that cannot be currently disabled even though related CONFIG_ options do exist. A much more involved solution that can keep the callsites always existing but hidden behind a static key if unused, is possible [1] and can be pursued by anyone who believes it's necessary. Meanwhile the fact the should_failslab() error injection is already not functional on kernels built with current gcc without anyone noticing [2], and lukewarm response to [1] suggests the need is not there. I believe it will be more fair to have the state after this series as a baseline for possible further optimisation, instead of the unconditional overhead. For example a possible compromise for anyone who's fine with an empty function call overhead but not the full CONFIG_FAILSLAB / CONFIG_FAIL_PAGE_ALLOC overhead is to reuse patch 1 from [1] but insert a static key check only inside should_failslab() and should_fail_alloc_page() before performing the more expensive checks. [1] https://lore.kernel.org/all/20240620-fault-injection-statickeys-v2-0-e23947d3d84b@suse.cz/#t [2] https://github.com/bpftrace/bpftrace/issues/3258 This patch (of 2): This mostly reverts commit 4f6923fbb352 ("mm: make should_failslab always available for fault injection"). 
The commit made should_failslab() a noinline function that's always called from the slab allocation hotpath, even if it's empty because CONFIG_SHOULD_FAILSLAB is not enabled, and there is no option to disable that call. This is visible in profiles and the function call overhead can be noticeable especially with cpu mitigations. Meanwhile the bpftrace program example in the commit silently does not work without CONFIG_SHOULD_FAILSLAB anyway with a recent gcc, because the empty function gets a .constprop clone that is actually being called (uselessly) from the slab hotpath, while the error injection is hooked to the original function that's not being called at all [1]. Thus put the whole should_failslab() function back behind CONFIG_SHOULD_FAILSLAB. It's not a complete revert of 4f6923fbb352 - the int return type that returns -ENOMEM on failure is preserved, as well ALLOW_ERROR_INJECTION annotation. The BTF_ID() record that was meanwhile added is also guarded by CONFIG_SHOULD_FAILSLAB. [1] https://github.com/bpftrace/bpftrace/issues/3258 Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-0-9e2651945d68@suse.cz Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-1-9e2651945d68@suse.cz Signed-off-by: Vlastimil Babka Cc: Akinobu Mita Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Christoph Lameter Cc: Daniel Borkmann Cc: David Rientjes Cc: Eduard Zingerman Cc: Hao Luo Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Martin KaFai Lau Cc: Mateusz Guzik Cc: Roman Gushchin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 6d5edef09d45..be6d0bc111ad 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -102,11 +102,10 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) } #endif /* CONFIG_FAIL_PAGE_ALLOC */ -int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #ifdef CONFIG_FAILSLAB -extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags); +int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #else -static inline bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) +static inline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) { return false; } -- cgit v1.2.3 From 53dabce2652fb854eae84609ce9c37429d5d87ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2024 18:35:31 +0200 Subject: mm, page_alloc: put should_fail_alloc_page() back behing CONFIG_FAIL_PAGE_ALLOC This mostly reverts commit af3b854492f3 ("mm/page_alloc.c: allow error injection"). The commit made should_fail_alloc_page() a noinline function that's always called from the page allocation hotpath, even if it's empty because CONFIG_FAIL_PAGE_ALLOC is not enabled, and there is no option to disable it and prevent the associated function call overhead. As with the preceding patch "mm, slab: put should_failslab back behind CONFIG_SHOULD_FAILSLAB" and for the same reasons, put the should_fail_alloc_page() back behind the config option. When enabled, the ALLOW_ERROR_INJECTION and BTF_ID records are preserved so it's not a complete revert. 
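The effect on the hot path is easiest to see at the call site, which is just an early bail-out; roughly:

    /* fault injection hook in the page allocation path */
    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;    /* injected failure */

With CONFIG_FAIL_PAGE_ALLOC=n the stub below is a constant false, so the compiler can drop both the call and the branch.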
Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-2-9e2651945d68@suse.cz Signed-off-by: Vlastimil Babka Cc: Akinobu Mita Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Christoph Lameter Cc: Daniel Borkmann Cc: David Rientjes Cc: Eduard Zingerman Cc: Hao Luo Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Martin KaFai Lau Cc: Mateusz Guzik Cc: Roman Gushchin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index be6d0bc111ad..354413950d34 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -91,12 +91,10 @@ static inline void fault_config_init(struct fault_config *config, struct kmem_cache; -bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); - #ifdef CONFIG_FAIL_PAGE_ALLOC -bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); +bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); #else -static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return false; } -- cgit v1.2.3 From 4810a82c8a8ae06fe6496a23fcb89a4952603e60 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 11 Jul 2024 15:04:55 -0700 Subject: lib: add missing newline character in the warning message Link: https://lkml.kernel.org/r/20240711220457.1751071-1-surenb@google.com Fixes: 22d407b164ff ("lib: add allocation tagging support for memory allocation profiling") Signed-off-by: Suren Baghdasaryan Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index abd24016a900..8c61ccd161ba 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -122,7 +122,7 @@ static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); - WARN_ONCE(!tag, "current->alloc_tag not set"); + WARN_ONCE(!tag, "current->alloc_tag not set\n"); } static inline void alloc_tag_sub_check(union codetag_ref *ref) -- cgit v1.2.3 From fd8acc0097b91fab3104fa8a66ce2fd9cf8b0c11 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 11 Jul 2024 15:04:56 -0700 Subject: lib: reuse page_ext_data() to obtain codetag_ref codetag_ref_from_page_ext() reimplements the same calculation as page_ext_data(). Reuse existing function instead. 
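For context, the general shape of a page_ext-backed lookup, with error handling trimmed:

    struct page_ext *page_ext = page_ext_get(page);
    union codetag_ref *ref;

    if (unlikely(!page_ext))
        return;

    /* page_ext_data() applies the per-ops offset for us */
    ref = page_ext_data(page_ext, &page_alloc_tagging_ops);
    /* ... read or update the tag reference ... */
    page_ext_put(page_ext);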
Link: https://lkml.kernel.org/r/20240711220457.1751071-2-surenb@google.com Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging") Signed-off-by: Suren Baghdasaryan Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 9cacadbd61f8..acb1e9ce7981 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -15,7 +15,7 @@ extern struct page_ext_operations page_alloc_tagging_ops; static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) { - return (void *)page_ext + page_alloc_tagging_ops.offset; + return (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); } static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) -- cgit v1.2.3 From 6ab42fe21c84d72da752923b4bd7075344f4a362 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 11 Jul 2024 15:04:57 -0700 Subject: alloc_tag: fix page_ext_get/page_ext_put sequence during page splitting pgalloc_tag_sub() might call page_ext_put() using a page different from the one used in page_ext_get() call. This does not pose an issue since page_ext_put() ignores this parameter as long as it's non-NULL but technically this is wrong. Fix it by storing the original page used in page_ext_get() and passing it to page_ext_put(). Link: https://lkml.kernel.org/r/20240711220457.1751071-3-surenb@google.com Fixes: be25d1d4e822 ("mm: create new codetag references during page splitting") Signed-off-by: Suren Baghdasaryan Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index acb1e9ce7981..18cd0c0c73d9 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -71,6 +71,7 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) static inline void pgalloc_tag_split(struct page *page, unsigned int nr) { int i; + struct page_ext *first_page_ext; struct page_ext *page_ext; union codetag_ref *ref; struct alloc_tag *tag; @@ -78,7 +79,7 @@ static inline void pgalloc_tag_split(struct page *page, unsigned int nr) if (!mem_alloc_profiling_enabled()) return; - page_ext = page_ext_get(page); + first_page_ext = page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; @@ -94,7 +95,7 @@ static inline void pgalloc_tag_split(struct page *page, unsigned int nr) page_ext = page_ext_next(page_ext); } out: - page_ext_put(page_ext); + page_ext_put(first_page_ext); } static inline struct alloc_tag *pgalloc_tag_get(struct page *page) -- cgit v1.2.3 From 667574e873b5f77a220b2a93329689f36fb56d5d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 12 Jul 2024 11:13:14 +0800 Subject: mm/hugetlb: fix possible recursive locking detected warning When tries to demote 1G hugetlb folios, a lockdep warning is observed: ============================================ WARNING: possible recursive locking detected 6.10.0-rc6-00452-ga4d0275fa660-dirty #79 Not tainted -------------------------------------------- bash/710 is trying to acquire lock: ffffffff8f0a7850 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0x244/0x460 but task is already holding 
lock: ffffffff8f0a6f48 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0xae/0x460 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&h->resize_lock); lock(&h->resize_lock); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by bash/710: #0: ffff8f118439c3f0 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x64/0xe0 #1: ffff8f11893b9e88 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xf8/0x1d0 #2: ffff8f1183dc4428 (kn->active#98){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1d0 #3: ffffffff8f0a6f48 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0xae/0x460 stack backtrace: CPU: 3 PID: 710 Comm: bash Not tainted 6.10.0-rc6-00452-ga4d0275fa660-dirty #79 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x68/0xa0 __lock_acquire+0x10f2/0x1ca0 lock_acquire+0xbe/0x2d0 __mutex_lock+0x6d/0x400 demote_store+0x244/0x460 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x380/0x540 ksys_write+0x64/0xe0 do_syscall_64+0xb9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fa61db14887 RSP: 002b:00007ffc56c48358 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fa61db14887 RDX: 0000000000000002 RSI: 000055a030050220 RDI: 0000000000000001 RBP: 000055a030050220 R08: 00007fa61dbd1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 R13: 00007fa61dc1b780 R14: 00007fa61dc17600 R15: 00007fa61dc16a00 Lockdep considers this an AA deadlock because the different resize_lock mutexes reside in the same lockdep class, but this is a false positive. Place them in distinct classes to avoid these warnings. Link: https://lkml.kernel.org/r/20240712031314.2570452-1-linmiaohe@huawei.com Fixes: 8531fc6f52f5 ("hugetlb: add hugetlb demote page support") Signed-off-by: Miaohe Lin Acked-by: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ecd0ceeea206..c9bf68c239a0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -663,6 +663,7 @@ HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) /* Defines one hugetlb page size */ struct hstate { struct mutex resize_lock; + struct lock_class_key resize_key; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; -- cgit v1.2.3
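One plausible way the new resize_key can be used is to pass it to the mutex initializer when the hstate is registered, so that each resize_lock ends up in its own lockdep class; the exact init-site change may differ from this sketch:

    /* e.g. in hugetlb_add_hstate() */
    __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key);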