From d155df53f31068c3340733d586eb9b3ddfd70fc5 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 17 Feb 2023 10:56:15 +0800 Subject: x86/mm/pat: clear VM_PAT if copy_p4d_range failed Syzbot reports a warning in untrack_pfn(). Digging into the root cause, we found that it is due to a memory allocation failure in pmd_alloc_one(), and that failure was injected by failslab. In copy_page_range(), memory allocation for a pmd failed. During the error handling in copy_page_range(), mmput() is called to remove all vmas. When untrack_pfn() then handles this empty pfn mapping, the warning fires. Here's a simplified flow: dup_mm dup_mmap copy_page_range copy_p4d_range copy_pud_range copy_pmd_range pmd_alloc __pmd_alloc pmd_alloc_one page = alloc_pages(gfp, 0); if (!page) return NULL; mmput exit_mmap unmap_vmas unmap_single_vma untrack_pfn follow_phys WARN_ON_ONCE(1); Since this vma was not set up successfully, we can clear the VM_PAT flag; untrack_pfn() will then not be called while this vma is cleaned up. The function untrack_pfn_moved() has also been renamed to untrack_pfn_clear() to fit the new logic. Link: https://lkml.kernel.org/r/20230217025615.1595558-1-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Reported-by: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Toshi Kani Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c63cd44777ec..9dc936bc77d1 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1191,9 +1191,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma, } /* - * untrack_pfn_moved is called while mremapping a pfnmap for a new region. + * untrack_pfn_clear is called while mremapping a pfnmap for a new region + * or fails to copy pgtable during duplicate vm area. */ -static inline void untrack_pfn_moved(struct vm_area_struct *vma) +static inline void untrack_pfn_clear(struct vm_area_struct *vma) { } #else @@ -1205,7 +1206,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked); -extern void untrack_pfn_moved(struct vm_area_struct *vma); +extern void untrack_pfn_clear(struct vm_area_struct *vma); #endif #ifdef CONFIG_MMU -- cgit v1.2.3 From 9a52b2f32a0942047348b30f866b846da5fcf4e3 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Tue, 14 Feb 2023 03:54:44 +0000 Subject: mm: multi-gen LRU: clean up sysfs code This patch cleans up the sysfs code. Specifically, 1. use sysfs_emit(), 2. use __ATTR_RW(), and 3. constify multi-gen LRU struct attribute_group. Link: https://lkml.kernel.org/r/20230214035445.1250139-1-talumbau@google.com Signed-off-by: T.J. 
Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9fb1b03b83b2..bf8786d45b31 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1369,7 +1369,7 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ - struct lru_gen_mm_walk mm_walk; + struct lru_gen_mm_walk mm_walk; /* lru_gen_folio list */ struct lru_gen_memcg memcg_lru; #endif -- cgit v1.2.3 From aa464ba9a1e444d5ef95bb63ee3b2ef26fc96ed7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Feb 2023 17:18:34 +1000 Subject: lazy tlb: introduce lazy tlb mm refcount helper functions Add explicit _lazy_tlb annotated functions for lazy tlb mm refcounting. This makes the lazy tlb mm references more obvious, and allows the refcounting scheme to be modified in later changes. There is no functional change with this patch. Link: https://lkml.kernel.org/r/20230203071837.1136453-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Linus Torvalds Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Michael Ellerman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2a243616f222..5376caf6fcf3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -79,6 +79,22 @@ static inline void mmdrop_sched(struct mm_struct *mm) } #endif +/* Helpers for lazy TLB mm refcounting */ +static inline void mmgrab_lazy_tlb(struct mm_struct *mm) +{ + mmgrab(mm); +} + +static inline void mmdrop_lazy_tlb(struct mm_struct *mm) +{ + mmdrop(mm); +} + +static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm) +{ + mmdrop_sched(mm); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. -- cgit v1.2.3 From 88e3009b5283bbd41447f0352d0b9df16cf6f183 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Feb 2023 17:18:35 +1000 Subject: lazy tlb: allow lazy tlb mm refcounting to be configurable Add CONFIG_MMU_TLB_REFCOUNT which enables refcounting of the lazy tlb mm when it is context switched. This can be disabled by architectures that don't require this refcounting if they clean up lazy tlb mms when the last refcount is dropped. Currently this is always enabled, so the patch introduces no functional change. 
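As a rough, hedged illustration (not part of this patch), the context-switch pattern these helpers target looks roughly like the sketch below; the field names follow kernel/sched/core.c, but the function is simplified and only meant to show where mmgrab_lazy_tlb()/mmdrop_lazy_tlb() stand in for bare mmgrab()/mmdrop():

```
/* Simplified sketch of a lazy-TLB context switch, not the real code. */
#include <linux/sched.h>
#include <linux/sched/mm.h>

static void example_context_switch(struct task_struct *prev,
				   struct task_struct *next)
{
	if (!next->mm) {
		/* Kernel thread: borrow the previous task's mm lazily. */
		next->active_mm = prev->active_mm;
		mmgrab_lazy_tlb(prev->active_mm);	/* was mmgrab() */
	}

	if (!prev->mm) {
		/*
		 * Drop the lazy reference taken above; the real kernel
		 * defers this to finish_task_switch() and uses
		 * mmdrop_lazy_tlb_sched() there.
		 */
		mmdrop_lazy_tlb(prev->active_mm);	/* was mmdrop() */
		prev->active_mm = NULL;
	}
}
```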
Link: https://lkml.kernel.org/r/20230203071837.1136453-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Linus Torvalds Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Michael Ellerman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5376caf6fcf3..689dbe812563 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -82,17 +82,29 @@ static inline void mmdrop_sched(struct mm_struct *mm) /* Helpers for lazy TLB mm refcounting */ static inline void mmgrab_lazy_tlb(struct mm_struct *mm) { - mmgrab(mm); + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) + mmgrab(mm); } static inline void mmdrop_lazy_tlb(struct mm_struct *mm) { - mmdrop(mm); + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) { + mmdrop(mm); + } else { + /* + * mmdrop_lazy_tlb must provide a full memory barrier, see the + * membarrier comment finish_task_switch which relies on this. + */ + smp_mb(); + } } static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm) { - mmdrop_sched(mm); + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) + mmdrop_sched(mm); + else + smp_mb(); /* see mmdrop_lazy_tlb() above */ } /** -- cgit v1.2.3 From 4c85c0be3d7a9a7ffe48bfe0954eacc0ba9d3c75 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Mon, 30 Jan 2023 13:25:13 +0900 Subject: mm, printk: introduce new format %pGt for page_type %pGp format is used to display 'flags' field of a struct page. However, some page flags (i.e. PG_buddy, see page-flags.h for more details) are stored in page_type field. To display human-readable output of page_type, introduce %pGt format. It is important to note the meaning of bits are different in page_type. if page_type is 0xffffffff, no flags are set. Setting PG_buddy (0x00000080) flag results in a page_type of 0xffffff7f. Clearing a bit actually means setting a flag. Bits in page_type are inverted when displaying type names. Only values for which page_type_has_type() returns true are considered as page_type, to avoid confusion with mapcount values. if it returns false, only raw values are displayed and not page type names. 
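A hedged usage sketch (the call site below is made up, presumably the kind of dump_page()-style debug output this series targets): %pGt is assumed to take a pointer to the page_type word, analogous to %pGp taking a pointer to page->flags, and prints the inverted bits as human-readable names:

```
/* Illustrative only; mirrors how dump_page() reports page state. */
#include <linux/printk.h>
#include <linux/mm_types.h>

static void example_report_page_type(struct page *page)
{
	/*
	 * For a PG_buddy page this would print something like
	 * "page_type: 0xffffff7f(buddy)"; for a mapcount value only
	 * the raw number is shown.
	 */
	pr_warn("page_type: %pGt\n", &page->page_type);
}
```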
Link: https://lkml.kernel.org/r/20230130042514.2418-3-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Petr Mladek [vsprintf part] Cc: Andy Shevchenko Cc: David Hildenbrand Cc: Joe Perches Cc: John Ogness Cc: Matthew Wilcox Cc: Sergey Senozhatsky Cc: Steven Rostedt (Google) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a7e3a3405520..57287102c5bd 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -926,9 +926,14 @@ static inline bool is_page_hwpoison(struct page *page) #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) +static inline int page_type_has_type(unsigned int page_type) +{ + return (int)page_type < PAGE_MAPCOUNT_RESERVE; +} + static inline int page_has_type(struct page *page) { - return (int)page->page_type < PAGE_MAPCOUNT_RESERVE; + return page_type_has_type(page->page_type); } #define PAGE_TYPE_OPS(uname, lname) \ -- cgit v1.2.3 From 16d91faf09be148d9c1cf4999f4d15fb24a7cd72 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 16 Feb 2023 11:59:24 -0800 Subject: kasan: call clear_page with a match-all tag instead of changing page tag Instead of changing the page's tag solely in order to obtain a pointer with a match-all tag and then changing it back again, just convert the pointer that we get from kmap_atomic() into one with a match-all tag before passing it to clear_page(). On a certain microarchitecture, this has been observed to cause a measurable improvement in microbenchmark performance, presumably as a result of being able to avoid the atomic operations on the page tag. Link: https://lkml.kernel.org/r/20230216195924.3287772-1-pcc@google.com Signed-off-by: Peter Collingbourne Link: https://linux-review.googlesource.com/id/I0249822cc29097ca7a04ad48e8eb14871f80e711 Reviewed-by: Andrey Konovalov Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Cc: Evgenii Stepanov Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 8fc10089e19e..9c7cdaa3de8c 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -243,12 +243,10 @@ static inline void clear_highpage(struct page *page) static inline void clear_highpage_kasan_tagged(struct page *page) { - u8 tag; + void *kaddr = kmap_local_page(page); - tag = page_kasan_tag(page); - page_kasan_tag_reset(page); - clear_highpage(page); - page_kasan_tag_set(page, tag); + clear_page(kasan_reset_tag(kaddr)); + kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE -- cgit v1.2.3 From 3e4fb13ac34bad94cf390415b1e6bc3ca9520d65 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 2 Mar 2023 19:58:35 +0800 Subject: mm: swap: remove unneeded cgroup_throttle_swaprate() All the callers of cgroup_throttle_swaprate() are converted to folio_throttle_swaprate(), so make __cgroup_throttle_swaprate() to take a folio, and rename it to __folio_throttle_swaprate(), also rename gfp_mask to gfp and drop redundant extern keyword. finally, drop unused cgroup_throttle_swaprate(). 
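For callers the conversion described above is mechanical; a minimal sketch (the function name is made up for the example):

```
#include <linux/swap.h>

static void example_fault_path(struct folio *folio, gfp_t gfp)
{
	/* Before: cgroup_throttle_swaprate(&folio->page, gfp); */
	folio_throttle_swaprate(folio, gfp);
}
```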
Link: https://lkml.kernel.org/r/20230302115835.105364-8-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 209a425739a9..d5d0b54e90e8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -641,22 +641,18 @@ extern atomic_t zswap_stored_pages; #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); -static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); +static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { if (mem_cgroup_disabled()) return; - __cgroup_throttle_swaprate(page, gfp_mask); + __folio_throttle_swaprate(folio, gfp); } #else -static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) -{ -} -#endif static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { - cgroup_throttle_swaprate(&folio->page, gfp); } +#endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); -- cgit v1.2.3 From 99c29133639a29fa803ea27ec79bf9e732efd062 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 6 Mar 2023 17:15:48 +0100 Subject: mm: add PTE pointer parameter to flush_tlb_fix_spurious_fault() s390 can do more fine-grained handling of spurious TLB protection faults, when there also is the PTE pointer available. Therefore, pass on the PTE pointer to flush_tlb_fix_spurious_fault() as an additional parameter. This will add no functional change to other architectures, but those with private flush_tlb_fix_spurious_fault() implementations need to be made aware of the new parameter. Link: https://lkml.kernel.org/r/20230306161548.661740-1-gerald.schaefer@linux.ibm.com Signed-off-by: Gerald Schaefer Reviewed-by: Alexander Gordeev Acked-by: Catalin Marinas [arm64] Acked-by: Michael Ellerman [powerpc] Acked-by: David Hildenbrand Cc: Anshuman Khandual Cc: Borislav Petkov (AMD) Cc: Christophe Leroy Cc: Dave Hansen Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9dc936bc77d1..c5a51481bbb9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -817,7 +817,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif #ifndef flush_tlb_fix_spurious_fault -#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) +#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif /* -- cgit v1.2.3 From 82b3aa2681ca92421c4b508e7c390000273c53fe Mon Sep 17 00:00:00 2001 From: Yue Zhao Date: Mon, 6 Mar 2023 23:41:36 +0800 Subject: mm, memcg: Prevent memory.swappiness load/store tearing The knob for cgroup v1 memory controller: memory.swappiness is not protected by any locking so it can be modified while it is used. This is not an actual problem because races are unlikely. But it is better to use [READ|WRITE]_ONCE to prevent compiler from doing anything funky. 
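A minimal sketch of the pattern being applied (the helper names below are illustrative): pairing READ_ONCE() with WRITE_ONCE() keeps the compiler from tearing, fusing or re-reading the racy accesses.

```
/* Sketch: lockless reader/writer of memcg->swappiness. */
#include <linux/memcontrol.h>

static int example_swappiness_read(struct mem_cgroup *memcg)
{
	return READ_ONCE(memcg->swappiness);
}

static void example_swappiness_write(struct mem_cgroup *memcg, int val)
{
	WRITE_ONCE(memcg->swappiness, val);
}
```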
The access of memcg->swappiness and vm_swappiness is lockless, so both of them can be concurrently set at the same time as we are trying to read them. All occurrences of memcg->swappiness and vm_swappiness are updated with [READ|WRITE]_ONCE. [findns94@gmail.com: v3] Link: https://lkml.kernel.org/r/20230308162555.14195-3-findns94@gmail.com Link: https://lkml.kernel.org/r/20230306154138.3775-3-findns94@gmail.com Signed-off-by: Yue Zhao Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Tang Yizhou Signed-off-by: Andrew Morton --- include/linux/swap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d5d0b54e90e8..bfc3b06b5f8f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -620,18 +620,18 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* Cgroup2 doesn't have per-cgroup swappiness */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return vm_swappiness; + return READ_ONCE(vm_swappiness); /* root ? */ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) - return vm_swappiness; + return READ_ONCE(vm_swappiness); - return memcg->swappiness; + return READ_ONCE(memcg->swappiness); } #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { - return vm_swappiness; + return READ_ONCE(vm_swappiness); } #endif -- cgit v1.2.3 From 452a8f40728065800b5a5b81f1152e9a16d39656 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:31:25 +0100 Subject: mm,jfs: move write_one_page/folio_write_one to jfs The last remaining user of folio_write_one through the write_one_page wrapper is jfs, so move the functionality there and hard code the call to metapage_writepage. Note that the use of the pagecache by the JFS 'metapage' buffer cache is a bit odd, and we could probably do without VM-level dirty tracking at all, but that's a change for another time. Link: https://lkml.kernel.org/r/20230307143125.27778-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Dave Kleikamp Cc: Changwei Ge Cc: Evgeniy Dushistov Cc: Gang He Cc: Jan Kara Cc: Jan Kara via Ocfs2-devel Cc: Joel Becker Cc: Joseph Qi Cc: Joseph Qi Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0acb8e1fb7af..853184a46411 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1066,12 +1066,6 @@ static inline void folio_cancel_dirty(struct folio *folio) bool folio_clear_dirty_for_io(struct folio *folio); bool clear_page_dirty_for_io(struct page *page); void folio_invalidate(struct folio *folio, size_t offset, size_t length); -int __must_check folio_write_one(struct folio *folio); -static inline int __must_check write_one_page(struct page *page) -{ - return folio_write_one(page_folio(page)); -} - int __set_page_dirty_nobuffers(struct page *page); bool noop_dirty_folio(struct address_space *mapping, struct folio *folio); -- cgit v1.2.3 From 2c6efe9cf2d7841b75fe38ed1adbd41a90f51ba0 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 9 Mar 2023 15:05:45 -0800 Subject: shmem: add support to ignore swap In doing experimentations with shmem having the option to avoid swap becomes a useful mechanism. 
One of the *raves* about brd over shmem is you can avoid swap, but that's not really a good reason to use brd if we can instead use shmem. Using brd has its own good reasons to exist, but just because "tmpfs" doesn't let you do that is not a great reason to avoid it if we can easily add support for it. I don't add support for reconfiguring incompatible options, but if we really wanted to we can add support for that. To avoid swap we use mapping_set_unevictable() upon inode creation, and put a WARN_ON_ONCE() stop-gap on writepages() for reclaim. Link: https://lkml.kernel.org/r/20230309230545.2930737-7-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Acked-by: Christian Brauner Tested-by: Xin Hao Reviewed-by: Davidlohr Bueso Cc: Adam Manzanares Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Kees Cook Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 103d1000a5a2..50bf82b36995 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -45,6 +45,7 @@ struct shmem_sb_info { kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ bool full_inums; /* If i_ino should be uint or ino_t */ + bool noswap; /* ignores VM reclaim / swap requests */ ino_t next_ino; /* The next per-sb inode number to use */ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ -- cgit v1.2.3 From 7eb16f23b9a415f062db22739e59bb144e0b24ab Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 10 Mar 2023 17:29:05 +0100 Subject: io-mapping: don't disable preempt on RT in io_mapping_map_atomic_wc(). io_mapping_map_atomic_wc() disables preemption and pagefaults for historical reasons. The conversion to io_mapping_map_local_wc(), which only disables migration, cannot be done wholesale because quite some call sites need to be updated to accommodate with the changed semantics. On PREEMPT_RT enabled kernels the io_mapping_map_atomic_wc() semantics are problematic due to the implicit disabling of preemption which makes it impossible to acquire 'sleeping' spinlocks within the mapped atomic sections. PREEMPT_RT replaces the preempt_disable() with a migrate_disable() for more than a decade. It could be argued that this is a justification to do this unconditionally, but PREEMPT_RT covers only a limited number of architectures and it disables some functionality which limits the coverage further. Limit the replacement to PREEMPT_RT for now. This is also done kmap_atomic(). 
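To make the constraint concrete, here is a hedged sketch of an affected kind of call site (names invented): with preemption implicitly disabled inside the atomic mapping, taking a spinlock_t, which becomes a sleeping lock on PREEMPT_RT, is not allowed; with migrate_disable() it remains legal.

```
#include <linux/io.h>
#include <linux/io-mapping.h>
#include <linux/spinlock.h>

static void example_write_reg(struct io_mapping *iomap, unsigned long offset,
			      spinlock_t *lock, u32 val)
{
	void __iomem *vaddr = io_mapping_map_atomic_wc(iomap, offset);

	spin_lock(lock);	/* a sleeping lock on PREEMPT_RT */
	writel(val, vaddr);
	spin_unlock(lock);

	io_mapping_unmap_atomic(vaddr);
}
```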
Link: https://lkml.kernel.org/r/20230310162905.O57Pj7hh@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reported-by: Richard Weinberger Link: https://lore.kernel.org/CAFLxGvw0WMxaMqYqJ5WgvVSbKHq2D2xcXTOgMCpgq9nDC-MWTQ@mail.gmail.com Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/io-mapping.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 09d4f17c8d3b..7376c1df9c90 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -69,7 +69,10 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + migrate_disable(); pagefault_disable(); return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); } @@ -79,7 +82,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr) { kunmap_local_indexed((void __force *)vaddr); pagefault_enable(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + migrate_enable(); } static inline void __iomem * @@ -162,7 +168,10 @@ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) { - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + migrate_disable(); pagefault_disable(); return io_mapping_map_wc(mapping, offset, PAGE_SIZE); } @@ -172,7 +181,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr) { io_mapping_unmap(vaddr); pagefault_enable(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + migrate_enable(); } static inline void __iomem * -- cgit v1.2.3 From 0a54864f8dfb64b64c84c9db6ff70e0e93690a33 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 9 Mar 2023 20:29:14 -0800 Subject: kasan: remove PG_skip_kasan_poison flag Code inspection reveals that PG_skip_kasan_poison is redundant with kasantag, because the former is intended to be set iff the latter is the match-all tag. It can also be observed that it's basically pointless to poison pages which have kasantag=0, because any pages with this tag would have been pointed to by pointers with match-all tags, so poisoning the pages would have little to no effect in terms of bug detection. Therefore, change the condition in should_skip_kasan_poison() to check kasantag instead, and remove PG_skip_kasan_poison and associated flags. 
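A hedged sketch of the new condition (the real should_skip_kasan_poison() in mm/page_alloc.c takes more arguments and handles additional cases): poisoning is skipped when the page's KASAN tag is already the match-all/kernel tag, which is what makes the dedicated page flag unnecessary.

```
/* Sketch only; assumes KASAN_TAG_KERNEL is the match-all tag (0xff). */
#include <linux/mm.h>
#include <linux/kasan-tags.h>

static bool example_should_skip_kasan_poison(struct page *page)
{
	return page_kasan_tag(page) == KASAN_TAG_KERNEL;
}
```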
Link: https://lkml.kernel.org/r/20230310042914.3805818-3-pcc@google.com Link: https://linux-review.googlesource.com/id/I57f825f2eaeaf7e8389d6cf4597c8a5821359838 Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Evgenii Stepanov Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 30 +++++++++++++----------------- include/linux/page-flags.h | 9 --------- 2 files changed, 13 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 5088637fe5c2..6583a58670c5 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -47,16 +47,14 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_ZERO 0x1000000u -#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u -#define ___GFP_SKIP_KASAN_POISON 0x4000000u +#define ___GFP_SKIP_ZERO 0x1000000u +#define ___GFP_SKIP_KASAN 0x2000000u #else -#define ___GFP_SKIP_ZERO 0 -#define ___GFP_SKIP_KASAN_UNPOISON 0 -#define ___GFP_SKIP_KASAN_POISON 0 +#define ___GFP_SKIP_ZERO 0 +#define ___GFP_SKIP_KASAN 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x8000000u +#define ___GFP_NOLOCKDEP 0x4000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -234,25 +232,24 @@ typedef unsigned int __bitwise gfp_t; * memory tags at the same time as zeroing memory has minimal additional * performace impact. * - * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. - * Only effective in HW_TAGS mode. - * - * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation. - * Typically, used for userspace pages. Only effective in HW_TAGS mode. + * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. + * Used for userspace and vmalloc pages; the latter are unpoisoned by + * kasan_unpoison_vmalloc instead. For userspace pages, results in + * poisoning being skipped as well, see should_skip_kasan_poison for + * details. Only effective in HW_TAGS mode. 
*/ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) #define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO) -#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) -#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) +#define __GFP_SKIP_KASAN ((__force gfp_t)___GFP_SKIP_KASAN) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** @@ -335,8 +332,7 @@ typedef unsigned int __bitwise gfp_t; #define GFP_DMA __GFP_DMA #define GFP_DMA32 __GFP_DMA32 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) -#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | \ - __GFP_SKIP_KASAN_POISON | __GFP_SKIP_KASAN_UNPOISON) +#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | __GFP_SKIP_KASAN) #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 57287102c5bd..dcda20c47b8f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -135,9 +135,6 @@ enum pageflags { #ifdef CONFIG_ARCH_USES_PG_ARCH_X PG_arch_2, PG_arch_3, -#endif -#ifdef CONFIG_KASAN_HW_TAGS - PG_skip_kasan_poison, #endif __NR_PAGEFLAGS, @@ -594,12 +591,6 @@ TESTCLEARFLAG(Young, young, PF_ANY) PAGEFLAG(Idle, idle, PF_ANY) #endif -#ifdef CONFIG_KASAN_HW_TAGS -PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD) -#else -PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison) -#endif - /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set -- cgit v1.2.3 From 42c9db39704839eeb77b27db4c1d57bfa2a54a5b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 13 Mar 2023 19:28:12 +0800 Subject: mm: vmscan: add a map_nr_max field to shrinker_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "make slab shrink lockless", v5. This patch series aims to make slab shrink lockless. 1. 
Background ============= On our servers, we often find the following system cpu hotspots: 52.22% [kernel] [k] down_read_trylock 19.60% [kernel] [k] up_read 8.86% [kernel] [k] shrink_slab 2.44% [kernel] [k] idr_find 1.25% [kernel] [k] count_shadow_nodes 1.18% [kernel] [k] shrink lruvec 0.71% [kernel] [k] mem_cgroup_iter 0.71% [kernel] [k] shrink_node 0.55% [kernel] [k] find_next_bit And we used bpftrace to capture its calltrace as follows: @[ down_read_trylock+1 shrink_slab+128 shrink_node+371 do_try_to_free_pages+232 try_to_free_pages+243 _alloc_pages_slowpath+771 _alloc_pages_nodemask+702 pagecache_get_page+255 filemap_fault+1361 ext4_filemap_fault+44 __do_fault+76 handle_mm_fault+3543 do_user_addr_fault+442 do_page_fault+48 page_fault+62 ]: 1161690 @[ down_read_trylock+1 shrink_slab+128 shrink_node+371 balance_pgdat+690 kswapd+389 kthread+246 ret_from_fork+31 ]: 8424884 @[ down_read_trylock+1 shrink_slab+128 shrink_node+371 do_try_to_free_pages+232 try_to_free_pages+243 __alloc_pages_slowpath+771 __alloc_pages_nodemask+702 __do_page_cache_readahead+244 filemap_fault+1674 ext4_filemap_fault+44 __do_fault+76 handle_mm_fault+3543 do_user_addr_fault+442 do_page_fault+48 page_fault+62 ]: 20917631 We can see that down_read_trylock() of shrinker_rwsem is being called with high frequency at that time. Because of the poor multicore scalability of atomic operations, this can lead to a significant drop in IPC (instructions per cycle). And more, the shrinker_rwsem is a global read-write lock in shrinkers subsystem, which protects most operations such as slab shrink, registration and unregistration of shrinkers, etc. This can easily cause problems in the following cases. 1) When the memory pressure is high and there are many filesystems mounted or unmounted at the same time, slab shrink will be affected (down_read_trylock() failed). Such as the real workload mentioned by Kirill Tkhai: ``` One of the real workloads from my experience is start of an overcommitted node containing many starting containers after node crash (or many resuming containers after reboot for kernel update). In these cases memory pressure is huge, and the node goes round in long reclaim. ``` 2) If a shrinker is blocked (such as the case mentioned in [1]) and a writer comes in (such as mount a fs), then this writer will be blocked and cause all subsequent shrinker-related operations to be blocked. [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/ All the above cases can be solved by replacing the shrinker_rwsem trylocks with SRCU. 2. Survey ========= Before doing the code implementation, I found that there were many similar submissions in the community: a. Davidlohr Bueso submitted a patch in 2015. Subject: [PATCH -next v2] mm: srcu-ify shrinkers Link: https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/ Result: It was finally merged into the linux-next branch, but failed on arm allnoconfig (without CONFIG_SRCU) b. Tetsuo Handa submitted a patchset in 2017. Subject: [PATCH 1/2] mm,vmscan: Kill global shrinker lock. Link: https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/ Result: Finally chose to use the current simple way (break when rwsem_is_contended()). And Christoph Hellwig suggested to using SRCU, but SRCU was not unconditionally enabled at the time. c. Kirill Tkhai submitted a patchset in 2018. 
Subject: [PATCH RFC 00/10] Introduce lockless shrink_slab() Link: https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/ Result: At that time, SRCU was not unconditionally enabled, and there were some objections to enabling SRCU. Later, because Kirill's focus was moved to other things, this patchset was not continued to be updated. d. Sultan Alsawaf submitted a patch in 2021. Subject: [PATCH] mm: vmscan: Replace shrinker_rwsem trylocks with SRCU protection Link: https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/ Result: Rejected because SRCU was not unconditionally enabled. We can find that almost all these historical commits were abandoned because SRCU was not unconditionally enabled. But now SRCU has been unconditionally enable by Paul E. McKenney in 2023 [2], so it's time to replace shrinker_rwsem trylocks with SRCU. [2] https://lore.kernel.org/lkml/20230105003759.GA1769545@paulmck-ThinkPad-P17-Gen-1/ 3. Reproduction and testing =========================== We can reproduce the down_read_trylock() hotspot through the following script: ``` #!/bin/bash DIR="/root/shrinker/memcg/mnt" do_create() { mkdir -p /sys/fs/cgroup/memory/test mkdir -p /sys/fs/cgroup/perf_event/test echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes for i in `seq 0 $1`; do mkdir -p /sys/fs/cgroup/memory/test/$i; echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; mkdir -p $DIR/$i; done } do_mount() { for i in `seq $1 $2`; do mount -t tmpfs $i $DIR/$i; done } do_touch() { for i in `seq $1 $2`; do echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & done } case "$1" in touch) do_touch $2 $3 ;; test) do_create 4000 do_mount 0 4000 do_touch 0 3000 ;; *) exit 1 ;; esac ``` Save the above script, then run test and touch commands. Then we can use the following perf command to view hotspots: perf top -U -F 999 1) Before applying this patchset: 32.31% [kernel] [k] down_read_trylock 19.40% [kernel] [k] pv_native_safe_halt 16.24% [kernel] [k] up_read 15.70% [kernel] [k] shrink_slab 4.69% [kernel] [k] _find_next_bit 2.62% [kernel] [k] shrink_node 1.78% [kernel] [k] shrink_lruvec 0.76% [kernel] [k] do_shrink_slab 2) After applying this patchset: 27.83% [kernel] [k] _find_next_bit 16.97% [kernel] [k] shrink_slab 15.82% [kernel] [k] pv_native_safe_halt 9.58% [kernel] [k] shrink_node 8.31% [kernel] [k] shrink_lruvec 5.64% [kernel] [k] do_shrink_slab 3.88% [kernel] [k] mem_cgroup_iter At the same time, we use the following perf command to capture IPC information: perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10 1) Before applying this patchset: Performance counter stats for 'system wide' (5 runs): 454187219766 cycles test ( +- 1.84% ) 78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% ) 10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% ) 2) After applying this patchset: Performance counter stats for 'system wide' (5 runs): 841954709443 cycles test ( +- 15.80% ) (98.69%) 527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%) 10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% ) We can see that IPC drops very seriously when calling down_read_trylock() at high frequency. After using SRCU, the IPC is at a normal level. 
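To make the direction of the series concrete before the per-patch details, here is a heavily simplified, hedged sketch of the intended end state in mm/vmscan.c (the actual conversion lands in later patches of this series):

```
#include <linux/srcu.h>
#include <linux/rculist.h>
#include <linux/shrinker.h>

DEFINE_SRCU(shrinker_srcu);	/* replaces shrinker_rwsem for readers */

static unsigned long example_shrink_slab(gfp_t gfp_mask, int nid,
					 struct mem_cgroup *memcg, int priority)
{
	unsigned long freed = 0;
	struct shrinker *shrinker;
	int idx;

	/* Was: if (!down_read_trylock(&shrinker_rwsem)) goto out; */
	idx = srcu_read_lock(&shrinker_srcu);

	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
				 srcu_read_lock_held(&shrinker_srcu)) {
		/* freed += do_shrink_slab(&sc, shrinker, priority); */
	}

	srcu_read_unlock(&shrinker_srcu, idx);
	return freed;
}
```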
This patch (of 8): To prepare for the subsequent lockless memcg slab shrink, add a map_nr_max field to struct shrinker_info to records its own real shrinker_nr_max. Link: https://lkml.kernel.org/r/20230313112819.38938-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230313112819.38938-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Kirill Tkhai Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Cc: Christian König Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Paul E. McKenney Cc: Qi Zheng Cc: Shakeel Butt Cc: Sultan Alsawaf Cc: Tetsuo Handa Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b6eda2ab205d..aa69ea98e2d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -97,6 +97,7 @@ struct shrinker_info { struct rcu_head rcu; atomic_long_t *nr_deferred; unsigned long *map; + int map_nr_max; }; struct lruvec_stats_percpu { -- cgit v1.2.3 From 3c556d2425b04054e22045d4ef7d34f163b7a71a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 15 Mar 2023 13:16:42 -0400 Subject: mm/thp: rename TRANSPARENT_HUGEPAGE_NEVER_DAX to _UNSUPPORTED TRANSPARENT_HUGEPAGE_NEVER_DAX has nothing to do with DAX. It's set when has_transparent_hugepage() returns false, checked in hugepage_vma_check() and will disable THP completely if false. Rename it to TRANSPARENT_HUGEPAGE_UNSUPPORTED to reflect its real purpose. [peterx@redhat.com: fix comment, per David] Link: https://lkml.kernel.org/r/ZBMzQW674oHQJV7F@x1n Link: https://lkml.kernel.org/r/20230315171642.1244625-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Aneesh Kumar K.V Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 70bd867eba94..9a3a3af2dd80 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -79,7 +79,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, } enum transparent_hugepage_flag { - TRANSPARENT_HUGEPAGE_NEVER_DAX, + TRANSPARENT_HUGEPAGE_UNSUPPORTED, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, -- cgit v1.2.3 From 263e721e3ba1f3b42ad61aed3d504f3c8c9c0eb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:34:05 +0100 Subject: mm: make mapping_get_entry available outside of filemap.c mapping_get_entry is useful for page cache API users that need to know about xa_value internals. Rename it and make it available in pagemap.h. 
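A hedged sketch of an external caller (usage assumed from the function's existing semantics): the return value is either NULL, a value entry (shadow/swap/DAX, tested with xa_is_value()), or a folio returned with an elevated refcount.

```
#include <linux/pagemap.h>
#include <linux/xarray.h>

static void example_lookup(struct address_space *mapping, pgoff_t index)
{
	void *entry = filemap_get_entry(mapping, index);

	if (!entry)
		return;		/* nothing cached at this index */

	if (xa_is_value(entry))
		return;		/* shadow, swap or DAX entry: do not dereference */

	/* A real folio, returned with its refcount raised. */
	folio_put((struct folio *)entry);
}
```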
Link: https://lkml.kernel.org/r/20230307143410.28031-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Hugh Dickins Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 853184a46411..bcfd798ce0b9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -507,6 +507,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_ENTRY 0x00000080 #define FGP_STABLE 0x00000100 +void *filemap_get_entry(struct address_space *mapping, pgoff_t index); struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, -- cgit v1.2.3 From 48c9d11375fc66f1e59d0e9b27d121e015a50904 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:34:09 +0100 Subject: mm: remove FGP_ENTRY FGP_ENTRY is unused now, so remove it. Link: https://lkml.kernel.org/r/20230307143410.28031-7-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andreas Gruenbacher Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcfd798ce0b9..306a0f63cea8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -504,8 +504,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 -#define FGP_ENTRY 0x00000080 -#define FGP_STABLE 0x00000100 +#define FGP_STABLE 0x00000080 void *filemap_get_entry(struct address_space *mapping, pgoff_t index); struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, -- cgit v1.2.3 From 66dabbb65d673aef40dd17bf62c042be8f6d4a4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:34:10 +0100 Subject: mm: return an ERR_PTR from __filemap_get_folio Instead of returning NULL for all errors, distinguish between: - no entry found and not asked to allocated (-ENOENT) - failed to allocate memory (-ENOMEM) - would block (-EAGAIN) so that callers don't have to guess the error based on the passed in flags. Also pass through the error through the direct callers: filemap_get_folio, filemap_lock_folio filemap_grab_folio and filemap_get_incore_folio. [hch@lst.de: fix null-pointer deref] Link: https://lkml.kernel.org/r/20230310070023.GA13563@lst.de Link: https://lkml.kernel.org/r/20230310043137.GA1624890@u2004 Link: https://lkml.kernel.org/r/20230307143410.28031-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Ryusuke Konishi [nilfs2] Cc: Andreas Gruenbacher Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 306a0f63cea8..fdcd595d2294 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -520,7 +520,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned with an increased refcount. 
* - * Otherwise, %NULL is returned. + * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for + * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_get_folio(struct address_space *mapping, pgoff_t index) @@ -537,8 +538,8 @@ static inline struct folio *filemap_get_folio(struct address_space *mapping, * present, it is returned locked with an increased refcount. * * Context: May sleep. - * Return: A folio or %NULL if there is no folio in the cache for this - * index. Will not return a shadow, swap or DAX entry. + * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for + * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_lock_folio(struct address_space *mapping, pgoff_t index) @@ -555,8 +556,8 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping, * a new folio is created. The folio is locked, marked as accessed, and * returned. * - * Return: A found or created folio. NULL if no folio is found and failed to - * create a folio. + * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found + * and failed to create a folio. */ static inline struct folio *filemap_grab_folio(struct address_space *mapping, pgoff_t index) -- cgit v1.2.3 From 2bad466cc9d9b4c3b4b16eb9c03c919b59561316 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Mar 2023 17:37:10 -0500 Subject: mm/uffd: UFFD_FEATURE_WP_UNPOPULATED Patch series "mm/uffd: Add feature bit UFFD_FEATURE_WP_UNPOPULATED", v4. The new feature bit makes anonymous memory acts the same as file memory on userfaultfd-wp in that it'll also wr-protect none ptes. It can be useful in two cases: (1) Uffd-wp app that needs to wr-protect none ptes like QEMU snapshot, so pre-fault can be replaced by enabling this flag and speed up protections (2) It helps to implement async uffd-wp mode that Muhammad is working on [1] It's debatable whether this is the most ideal solution because with the new feature bit set, wr-protect none pte needs to pre-populate the pgtables to the last level (PAGE_SIZE). But it seems fine so far to service either purpose above, so we can leave optimizations for later. The series brings pte markers to anonymous memory too. There's some change in the common mm code path in the 1st patch, great to have some eye looking at it, but hopefully they're still relatively straightforward. This patch (of 2): This is a new feature that controls how uffd-wp handles none ptes. When it's set, the kernel will handle anonymous memory the same way as file memory, by allowing the user to wr-protect unpopulated ptes. File memories handles none ptes consistently by allowing wr-protecting of none ptes because of the unawareness of page cache being exist or not. For anonymous it was not as persistent because we used to assume that we don't need protections on none ptes or known zero pages. One use case of such a feature bit was VM live snapshot, where if without wr-protecting empty ptes the snapshot can contain random rubbish in the holes of the anonymous memory, which can cause misbehave of the guest when the guest OS assumes the pages should be all zeros. QEMU worked it around by pre-populate the section with reads to fill in zero page entries before starting the whole snapshot process [1]. Recently there's another need raised on using userfaultfd wr-protect for detecting dirty pages (to replace soft-dirty in some cases) [2]. 
In that case if without being able to wr-protect none ptes by default, the dirty info can get lost, since we cannot treat every none pte to be dirty (the current design is identify a page dirty based on uffd-wp bit being cleared). In general, we want to be able to wr-protect empty ptes too even for anonymous. This patch implements UFFD_FEATURE_WP_UNPOPULATED so that it'll make uffd-wp handling on none ptes being consistent no matter what the memory type is underneath. It doesn't have any impact on file memories so far because we already have pte markers taking care of that. So it only affects anonymous. The feature bit is by default off, so the old behavior will be maintained. Sometimes it may be wanted because the wr-protect of none ptes will contain overheads not only during UFFDIO_WRITEPROTECT (by applying pte markers to anonymous), but also on creating the pgtables to store the pte markers. So there's potentially less chance of using thp on the first fault for a none pmd or larger than a pmd. The major implementation part is teaching the whole kernel to understand pte markers even for anonymously mapped ranges, meanwhile allowing the UFFDIO_WRITEPROTECT ioctl to apply pte markers for anonymous too when the new feature bit is set. Note that even if the patch subject starts with mm/uffd, there're a few small refactors to major mm path of handling anonymous page faults. But they should be straightforward. With WP_UNPOPUATED, application like QEMU can avoid pre-read faults all the memory before wr-protect during taking a live snapshot. Quotting from Muhammad's test result here [3] based on a simple program [4]: (1) With huge page disabled echo madvise > /sys/kernel/mm/transparent_hugepage/enabled ./uffd_wp_perf Test DEFAULT: 4 Test PRE-READ: 1111453 (pre-fault 1101011) Test MADVISE: 278276 (pre-fault 266378) Test WP-UNPOPULATE: 11712 (2) With Huge page enabled echo always > /sys/kernel/mm/transparent_hugepage/enabled ./uffd_wp_perf Test DEFAULT: 4 Test PRE-READ: 22521 (pre-fault 22348) Test MADVISE: 4909 (pre-fault 4743) Test WP-UNPOPULATE: 14448 There'll be a great perf boost for no-thp case, while for thp enabled with extreme case of all-thp-zero WP_UNPOPULATED can be slower than MADVISE, but that's low possibility in reality, also the overhead was not reduced but postponed until a follow up write on any huge zero thp, so potentially it is faster by making the follow up writes slower. 
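A minimal userspace sketch of enabling the feature (error handling trimmed; assumes a kernel and uapi headers that carry this series):

```
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/* Returns the uffd on success, -1 on any error (checks trimmed). */
static int wp_unpopulated_range(void *addr, unsigned long len)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_UNPOPULATED,
	};
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
	};

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
	    ioctl(uffd, UFFDIO_REGISTER, &reg) ||
	    ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
		return -1;

	/* None ptes in [addr, addr + len) are now wr-protected as well. */
	return uffd;
}
```

Without the feature bit, the UFFDIO_WRITEPROTECT above would leave unpopulated ptes unprotected and the caller would have to pre-fault the range first.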
[1] https://lore.kernel.org/all/20210401092226.102804-4-andrey.gruzdev@virtuozzo.com/ [2] https://lore.kernel.org/all/Y+v2HJ8+3i%2FKzDBu@x1n/ [3] https://lore.kernel.org/all/d0eb0a13-16dc-1ac1-653a-78b7273781e3@collabora.com/ [4] https://github.com/xzpeter/clibs/blob/master/uffd-test/uffd-wp-perf.c [peterx@redhat.com: comment changes, oneliner fix to khugepaged] Link: https://lkml.kernel.org/r/ZB2/8jPhD3fpx5U8@x1n Link: https://lkml.kernel.org/r/20230309223711.823547-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20230309223711.823547-2-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Muhammad Usama Anjum Cc: Nadav Amit Cc: Paul Gofman Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 6 ++++++ include/linux/userfaultfd_k.h | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index de1e622dd366..0e1d239a882c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(*pte)); + /* + * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole + * thing, because when zapping either it means it's dropping the + * page, or in TTU where the present pte will be quickly replaced + * with a swap pte. There's no way of leaking the bit. + */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3767f18114ef..0cf8880219da 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -179,6 +179,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); +extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); #else /* CONFIG_USERFAULTFD */ @@ -274,8 +275,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ +static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) +{ + /* Only wr-protect mode uses pte markers */ + if (!userfaultfd_wp(vma)) + return false; + + /* File-based uffd-wp always need markers */ + if (!vma_is_anonymous(vma)) + return true; + + /* + * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED + * enabled (to apply markers on zero pages). + */ + return userfaultfd_wp_unpopulated(vma); +} + static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) { #ifdef CONFIG_PTE_MARKER_UFFD_WP -- cgit v1.2.3 From 23baf831a32c04f9a968812511540b1b3e648bf5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Mar 2023 14:31:33 +0300 Subject: mm, treewide: redefine MAX_ORDER sanely MAX_ORDER currently defined as number of orders page allocator supports: user can ask buddy allocator for page order between 0 and MAX_ORDER-1. This definition is counter-intuitive and lead to number of bugs all over the kernel. Change the definition of MAX_ORDER to be inclusive: the range of orders user can ask from buddy allocator is 0..MAX_ORDER now. 
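For illustration (function names made up), loops and bounds checks in callers now read as follows under the inclusive definition:

```
#include <linux/mmzone.h>

static void example_walk_free_areas(struct zone *zone)
{
	unsigned int order;

	/* Was: for (order = 0; order < MAX_ORDER; order++) */
	for (order = 0; order <= MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];

		(void)area;	/* ... inspect area->nr_free etc. ... */
	}
}

static bool example_order_is_valid(unsigned int order)
{
	/* Was: order < MAX_ORDER */
	return order <= MAX_ORDER;
}
```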
[kirill@shutemov.name: fix min() warning] Link: https://lkml.kernel.org/r/20230315153800.32wib3n5rickolvh@box [akpm@linux-foundation.org: fix another min_t warning] [kirill@shutemov.name: fixups per Zi Yan] Link: https://lkml.kernel.org/r/20230316232144.b7ic4cif4kjiabws@box.shutemov.name [akpm@linux-foundation.org: fix underlining in docs] Link: https://lore.kernel.org/oe-kbuild-all/202303191025.VRCTk6mP-lkp@intel.com/ Link: https://lkml.kernel.org/r/20230315113133.11326-11-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Michael Ellerman [powerpc] Cc: "Kirill A. Shutemov" Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- include/linux/mmzone.h | 10 +++++----- include/linux/pageblock-flags.h | 4 ++-- include/linux/slab.h | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7c977d234aba..8fb7d91cd0b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -818,7 +818,7 @@ static inline unsigned huge_page_shift(struct hstate *h) static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) >= MAX_ORDER; + return huge_page_order(h) > MAX_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bf8786d45b31..35b11cc210d9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -26,11 +26,11 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER -#define MAX_ORDER 11 +#define MAX_ORDER 10 #else #define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif -#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) +#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed @@ -93,7 +93,7 @@ static inline bool migratetype_is_mergeable(int mt) } #define for_each_migratetype_order(order, type) \ - for (order = 0; order < MAX_ORDER; order++) \ + for (order = 0; order <= MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; @@ -922,7 +922,7 @@ struct zone { CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ - struct free_area free_area[MAX_ORDER]; + struct free_area free_area[MAX_ORDER + 1]; /* zone flags, see below */ unsigned long flags; @@ -1745,7 +1745,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) -#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 5f1ae07d724b..e83c4c095041 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,14 +41,14 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. 
*/ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1) +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order (MAX_ORDER-1) +#define pageblock_order MAX_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 45af70315a94..aa4575ef2965 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -284,7 +284,7 @@ static inline unsigned int arch_slab_minalign(void) * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif @@ -292,7 +292,7 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLUB #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif @@ -305,7 +305,7 @@ static inline unsigned int arch_slab_minalign(void) * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif -- cgit v1.2.3 From a734991ccaec1985fff42fb26bb6d789d35defb4 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:47 -0700 Subject: mm: userfaultfd: rename functions for clarity + consistency Patch series "mm: userfaultfd: refactor and add UFFDIO_CONTINUE_MODE_WP", v5. - Commits 1-3 refactor userfaultfd ioctl code without behavior changes, with the main goal of improving consistency and reducing the number of function args. - Commit 4 adds UFFDIO_CONTINUE_MODE_WP. This patch (of 4): The basic problem is, over time we've added new userfaultfd ioctls, and we've refactored the code so functions which used to handle only one case are now re-used to deal with several cases. While this happened, we didn't bother to rename the functions. Similarly, as we added new functions, we cargo-culted pieces of the now-inconsistent naming scheme, so those functions too ended up with names that don't make a lot of sense. A key point here is, "copy" in most userfaultfd code refers specifically to UFFDIO_COPY, where we allocate a new page and copy its contents from userspace. There are many functions with "copy" in the name that don't actually do this (at least in some cases). So, rename things into a consistent scheme. The high level idea is that the call stack for userfaultfd ioctls becomes: userfaultfd_ioctl -> userfaultfd_(particular ioctl) -> mfill_atomic_(particular kind of fill operation) -> mfill_atomic /* loops over pages in range */ -> mfill_atomic_pte /* deals with single pages */ -> mfill_atomic_pte_(particular kind of fill operation) -> mfill_atomic_install_pte There are of course some special cases (shmem, hugetlb), but this is the general structure which all function names now adhere to. 
Link: https://lkml.kernel.org/r/20230314221250.682452-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20230314221250.682452-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: James Houghton Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 30 +++++++++++++++--------------- include/linux/userfaultfd_k.h | 18 +++++++++--------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8fb7d91cd0b1..152434396c48 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -158,13 +158,13 @@ unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD -int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy); +int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + enum mcopy_atomic_mode mode, + struct page **pagep, + bool wp_copy); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, @@ -393,14 +393,14 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, } #ifdef CONFIG_USERFAULTFD -static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, - pte_t *dst_pte, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy) +static inline int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + enum mcopy_atomic_mode mode, + struct page **pagep, + bool wp_copy) { BUG(); return 0; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0cf8880219da..ac178d810dc7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,15 +61,15 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, unsigned long dst_addr, struct page *page, bool newly_allocated, bool wp_copy); -extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, __u64 mode); -extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len, - atomic_t *mmap_changing); -extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); +extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len, + atomic_t *mmap_changing, __u64 mode); +extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len, + atomic_t *mmap_changing); +extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long len, atomic_t *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long 
len, bool enable_wp, atomic_t *mmap_changing); -- cgit v1.2.3 From 61c5004022f56c443b86800e8985d8803f3a22aa Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:48 -0700 Subject: mm: userfaultfd: don't pass around both mm and vma Quite a few userfaultfd functions took both mm and vma pointers as arguments. Since the mm is trivially accessible via vma->vm_mm, there's no reason to pass both; it just needlessly extends the already long argument list. Get rid of the mm pointer, where possible, to shorten the argument list. Link: https://lkml.kernel.org/r/20230314221250.682452-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: James Houghton Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 ++--- include/linux/shmem_fs.h | 4 ++-- include/linux/userfaultfd_k.h | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 152434396c48..3cb7cd853fa8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -158,7 +158,7 @@ unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD -int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, +int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, @@ -393,8 +393,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, } #ifdef CONFIG_USERFAULTFD -static inline int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, - pte_t *dst_pte, +static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 50bf82b36995..922a2b45fe6f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -152,14 +152,14 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, +extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, bool zeropage, bool wp_copy, struct page **pagep); #else /* !CONFIG_SHMEM */ -#define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ +#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac178d810dc7..9458cd94a508 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -56,7 +56,7 @@ enum mcopy_atomic_mode { MCOPY_ATOMIC_CONTINUE, }; -extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, +extern int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, bool wp_copy); @@ -73,7 +73,7 @@ extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t 
*mmap_changing); -extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, +extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ -- cgit v1.2.3 From d9712937037e0ce887920f321429826e9dbfd960 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:49 -0700 Subject: mm: userfaultfd: combine 'mode' and 'wp_copy' arguments Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy' argument. In future commits we plan to plumb the flags through to more places, so we'd be proliferating the very long argument list even further. Let's take the time to simplify the argument list. Combine the two arguments into one - and generalize, so when we add more flags in the future, it doesn't imply more function arguments. Since the modes (copy, zeropage, continue) are mutually exclusive, store them as an integer value (0, 1, 2) in the low bits. Place combine-able flag bits in the high bits. This is quite similar to an earlier patch proposed by Nadav Amit ("userfaultfd: introduce uffd_flags" [1]). The main difference is that patch only handled flags, whereas this patch *also* combines the "mode" argument into the same type to shorten the argument list. [1]: https://lore.kernel.org/all/20220619233449.181323-2-namit@vmware.com/ Link: https://lkml.kernel.org/r/20230314221250.682452-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: James Houghton Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ++++------ include/linux/shmem_fs.h | 5 +++-- include/linux/userfaultfd_k.h | 46 ++++++++++++++++++++++++++++--------------- 3 files changed, 37 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3cb7cd853fa8..2a758bcd6719 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -162,9 +162,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy); + uffd_flags_t flags, + struct page **pagep); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, @@ -397,9 +396,8 @@ static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy) + uffd_flags_t flags, + struct page **pagep) { BUG(); return 0; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 922a2b45fe6f..3bb8d21edbb3 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -9,6 +9,7 @@ #include #include #include +#include /* inode in-kernel data */ @@ -156,11 +157,11 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - bool zeropage, bool wp_copy, + uffd_flags_t flags, struct page **pagep); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ - src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; }) + src_addr, flags, pagep) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ diff --git a/include/linux/userfaultfd_k.h 
b/include/linux/userfaultfd_k.h index 9458cd94a508..4c477dece540 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -40,30 +40,44 @@ extern int sysctl_unprivileged_userfaultfd; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); -/* - * The mode of operation for __mcopy_atomic and its helpers. - * - * This is almost an implementation detail (mcopy_atomic below doesn't take this - * as a parameter), but it's exposed here because memory-kind-specific - * implementations (e.g. hugetlbfs) need to know the mode of operation. - */ -enum mcopy_atomic_mode { - /* A normal copy_from_user into the destination range. */ - MCOPY_ATOMIC_NORMAL, - /* Don't copy; map the destination range to the zero page. */ - MCOPY_ATOMIC_ZEROPAGE, - /* Just install pte(s) with the existing page(s) in the page cache. */ - MCOPY_ATOMIC_CONTINUE, +/* A combined operation mode + behavior flags. */ +typedef unsigned int __bitwise uffd_flags_t; + +/* Mutually exclusive modes of operation. */ +enum mfill_atomic_mode { + MFILL_ATOMIC_COPY, + MFILL_ATOMIC_ZEROPAGE, + MFILL_ATOMIC_CONTINUE, + NR_MFILL_ATOMIC_MODES, }; +#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) +#define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr)) +#define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr)) +#define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1)) + +static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected) +{ + return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected); +} + +static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode) +{ + flags &= ~MFILL_ATOMIC_MODE_MASK; + return flags | ((__force uffd_flags_t) mode); +} + +/* Flags controlling behavior. These behavior changes are mode-independent. */ +#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) + extern int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, - bool newly_allocated, bool wp_copy); + bool newly_allocated, uffd_flags_t flags); extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, __u64 mode); + atomic_t *mmap_changing, uffd_flags_t flags); extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, -- cgit v1.2.3 From 0289184476c845968ad6ac9083c96cc0f75ca505 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:50 -0700 Subject: mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new PTE to resolve a missing fault, one can install a write-protected one. This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in combination. This was motivated by testing HugeTLB HGM [1], and in particular its interaction with userfaultfd features. Existing userfaultfd code supports using WP and MINOR modes together (i.e. you can register an area with both enabled), but without this CONTINUE flag the combination is in practice unusable. So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing as UFFDIO_COPY_MODE_WP, but for *minor* faults. Update the selftest to do some very basic exercising of the new flag. 
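For illustration, resolving a minor fault while leaving the PTE write-protected could look roughly like this from userspace (a minimal sketch assuming the usual userfaultfd uapi headers, a uffd whose range was registered with UFFDIO_REGISTER_MODE_MINOR | UFFDIO_REGISTER_MODE_WP, and no error handling):

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>

    /* Sketch: resolve a minor fault in [addr, addr + len) and keep it write-protected. */
    static int uffd_continue_wp(int uffd, unsigned long addr, unsigned long len)
    {
            struct uffdio_continue cont = {
                    .range = { .start = addr, .len = len },
                    .mode  = UFFDIO_CONTINUE_MODE_WP,   /* the new flag */
            };

            return ioctl(uffd, UFFDIO_CONTINUE, &cont);
    }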
Update Documentation/ to describe how these flags are used (neither the COPY nor the new CONTINUE versions of this mode flag were described there before). [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/ Link: https://lkml.kernel.org/r/20230314221250.682452-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 4c477dece540..a2c53e98dfd6 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,7 +83,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long len, atomic_t *mmap_changing); extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); -- cgit v1.2.3 From 5d671eb4ef7e1423a78fc0c12dd419c1dc1f6967 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 19 Mar 2023 13:42:14 +0200 Subject: mm: move get_page_from_free_area() to mm/page_alloc.c The get_page_from_free_area() helper is only used in mm/page_alloc.c so move it there to reduce noise in include/linux/mmzone.h Link: https://lkml.kernel.org/r/20230319114214.2133332-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Lorenzo Stoakes Acked-by: Kirill A. Shutemov Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 35b11cc210d9..2d3d78d01283 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -108,13 +108,6 @@ struct free_area { unsigned long nr_free; }; -static inline struct page *get_page_from_free_area(struct free_area *area, - int migratetype) -{ - return list_first_entry_or_null(&area->free_list[migratetype], - struct page, lru); -} - static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); -- cgit v1.2.3 From 9420f89db2dd611c5b436a13e13f74d65ecc3a6a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:02 +0200 Subject: mm: move most of core MM initialization to mm/mm_init.c The bulk of memory management initialization code is spread all over mm/page_alloc.c and makes navigating through page allocator functionality difficult. Move most of the functions marked __init and __meminit to mm/mm_init.c to make it better localized and allow some more spare room before mm/page_alloc.c reaches 10k lines. No functional changes. 
Link: https://lkml.kernel.org/r/20230321170513.2401534-4-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/gfp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 65a78773dcca..7c554e4bd49f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -361,9 +361,4 @@ extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); -#ifdef CONFIG_CMA -/* CMA stuff */ -extern void init_cma_reserved_pageblock(struct page *page); -#endif - #endif /* __LINUX_GFP_H */ -- cgit v1.2.3 From c4fbed4b0284d91797ee3cf60983b94c84c396b6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:04 +0200 Subject: mm/page_alloc: rename page_alloc_init() to page_alloc_init_cpuhp() The page_alloc_init() name is really misleading because all this function does is sets up CPU hotplug callbacks for the page allocator. Rename it to page_alloc_init_cpuhp() so that name will reflect what the function does. Link: https://lkml.kernel.org/r/20230321170513.2401534-6-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7c554e4bd49f..ed8cb537c6a7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -319,7 +319,7 @@ extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) -void page_alloc_init(void); +void page_alloc_init_cpuhp(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); -- cgit v1.2.3 From b7ec1bf3e7b9dd9d3335c937f5d834680d74addf Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:06 +0200 Subject: init,mm: move mm_init() to mm/mm_init.c and rename it to mm_core_init() Make mm_init() a part of mm/ codebase. 
mm_core_init() better describes what the function does and does not clash with mm_init() in kernel/fork.c Link: https://lkml.kernel.org/r/20230321170513.2401534-8-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f79667824eb..6be179cec081 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -38,6 +38,7 @@ struct pt_regs; extern int sysctl_page_lock_unfairness; +void mm_core_init(void); void init_mm_internals(void); #ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ -- cgit v1.2.3 From 4cd1e9edf60efb20fad35cf5e9ade7ad75b34cd1 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:07 +0200 Subject: mm: call {ptlock,pgtable}_cache_init() directly from mm_core_init() and drop pgtable_init() as it has no real value and its name is misleading. Link: https://lkml.kernel.org/r/20230321170513.2401534-9-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Cc: Sergei Shtylyov Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6be179cec081..dfff8bcaa766 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2637,12 +2637,6 @@ static inline bool ptlock_init(struct page *page) { return true; } static inline void ptlock_free(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ -static inline void pgtable_init(void) -{ - ptlock_cache_init(); - pgtable_cache_init(); -} - static inline bool pgtable_pte_page_ctor(struct page *page) { if (!ptlock_init(page)) -- cgit v1.2.3 From f2fc4b44ec2bb94c51c7ae1af9b1177d72705992 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:08 +0200 Subject: mm: move init_mem_debugging_and_hardening() to mm/mm_init.c init_mem_debugging_and_hardening() is only called from mm_core_init(). Move it close to the caller, make it static and rename it to mem_debugging_and_hardening_init() for consistency with surrounding convention. 
Link: https://lkml.kernel.org/r/20230321170513.2401534-10-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dfff8bcaa766..0fa6a6f94eda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3251,7 +3251,6 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -extern void __init init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); -- cgit v1.2.3 From de57807e6f267a658a046dbca44dc40fe806d60f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:09 +0200 Subject: init,mm: fold late call to page_ext_init() to page_alloc_init_late() When deferred initialization of struct pages is enabled, page_ext_init() must be called after all the deferred initialization is done, but there is no point to keep it a separate call from kernel_init_freeable() right after page_alloc_init_late(). Fold the call to page_ext_init() into page_alloc_init_late() and localize deferred_struct_pages variable. Link: https://lkml.kernel.org/r/20230321170513.2401534-11-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index bc2e39090a1f..67314f648aeb 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -29,8 +29,6 @@ struct page_ext_operations { bool need_shared_flags; }; -extern bool deferred_struct_pages; - #ifdef CONFIG_PAGE_EXTENSION /* -- cgit v1.2.3 From eb8589b4f8c107c346421881963c0ee0b8367c2c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:10 +0200 Subject: mm: move mem_init_print_info() to mm_init.c mem_init_print_info() is only called from mm_core_init(). Move it close to the caller and make it static. 
Link: https://lkml.kernel.org/r/20230321170513.2401534-12-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0fa6a6f94eda..c3abad7e6ccb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2780,7 +2780,6 @@ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); -extern void mem_init_print_info(void); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); -- cgit v1.2.3 From d5d2c02a4980c2e22037679457bf2d921b86a503 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:11 +0200 Subject: mm: move kmem_cache_init() declaration to mm/slab.h kmem_cache_init() is called only from mm_core_init(), there is no need to declare it in include/linux/slab.h Move kmem_cache_init() declaration to mm/slab.h Link: https://lkml.kernel.org/r/20230321170513.2401534-13-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/slab.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index aa4575ef2965..f8b1d63c63a3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -167,7 +167,6 @@ struct mem_cgroup; /* * struct kmem_cache related prototypes */ -void __init kmem_cache_init(void); bool slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, -- cgit v1.2.3 From b671491199acd3609f87754d268f2f5cb10de18d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 21 Mar 2023 19:05:12 +0200 Subject: mm: move vmalloc_init() declaration to mm/internal.h vmalloc_init() is called only from mm_core_init(), there is no need to declare it in include/linux/vmalloc.h Move vmalloc_init() declaration to mm/internal.h Link: https://lkml.kernel.org/r/20230321170513.2401534-14-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Doug Berger Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 69250efa03d1..351fc7697214 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -131,12 +131,8 @@ extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU -extern void __init vmalloc_init(void); extern unsigned long vmalloc_nr_pages(void); #else -static inline void vmalloc_init(void) -{ -} static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif -- cgit v1.2.3 From bd23024b9774e681cbe6cc3afcb24244dfcb2390 Mon Sep 17 00:00:00 2001 From: Tomas Mudrunka Date: Tue, 21 Mar 2023 11:34:30 +0100 Subject: mm/memtest: add results of early memtest to /proc/meminfo Currently the memtest results were only presented in dmesg. 
When running a large fleet of devices without ECC RAM it's currently not easy to do bulk monitoring for memory corruption. You have to parse dmesg, but that's a ring buffer so the error might disappear after some time. In general I do not consider dmesg to be a great API to query RAM status. In several companies I've seen such errors remain undetected and cause issues for way too long. So I think it makes sense to provide a monitoring API, so that we can safely detect and act upon them. This adds /proc/meminfo entry which can be easily used by scripts. Link: https://lkml.kernel.org/r/20230321103430.7130-1-tomas.mudrunka@gmail.com Signed-off-by: Tomas Mudrunka Cc: Jonathan Corbet Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/memblock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 50ad19662a32..f82ee3fac1cd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -597,6 +597,8 @@ extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif #ifdef CONFIG_MEMTEST +extern phys_addr_t early_memtest_bad_size; /* Size of faulty ram found by memtest */ +extern bool early_memtest_done; /* Was early memtest done? */ extern void early_memtest(phys_addr_t start, phys_addr_t end); #else static inline void early_memtest(phys_addr_t start, phys_addr_t end) -- cgit v1.2.3 From 28d8b812e97b31231e95864f36a6b32f4b307daa Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 12 Mar 2023 23:40:13 +0000 Subject: mm: remove unused vmf_insert_mixed_prot() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Remove drm/ttm-specific mm changes". Functionality was added specifically for the DRM TTM driver to support mapping memory for VM_MIXEDMAP VMAs with customised protection flags, however this has now been rolled back as issues were found with this approach. This series removes the mm changes too, retaining some of the useful comments. This patch (of 3): The sole user of vmf_insert_mixed_prot(), the drm ttm module, stopped using this in commit f91142c62161 ("drm/ttm: nuke VM_MIXEDMAP on BO mappings v3") citing use of VM_MIXEDMAP in this case being terribly broken. Remove this now-dead code and references to it, but retain the useful description of the prot != vma->vm_page_prot case, moving it to vmf_insert_pfn_prot() instead. Link: https://lkml.kernel.org/r/cover.1678661628.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/a069644388e6f1593a7020d15840e6fc9f39bcaf.1678661628.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Thomas Hellström Cc: Aaron Tomlin Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Heiko Carstens Cc: Huacai Chen Cc: Marcelo Tosatti Cc: Peter Xu Cc: "Russell King (Oracle)" Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- include/linux/mm_types.h | 7 +------ 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c3abad7e6ccb..55d02356146c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3179,8 +3179,6 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); -vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0722859c3647..651a5c256732 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -485,12 +485,7 @@ struct vm_area_struct { within vm_mm. */ struct mm_struct *vm_mm; /* The address space we belong to. */ - - /* - * Access permissions of this VMA. - * See vmf_insert_mixed_prot() for discussion. - */ - pgprot_t vm_page_prot; + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ /* * Flags, see mm.h. -- cgit v1.2.3 From 7b806d229ef151c2997c041b791dfa89593e0e1b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 12 Mar 2023 23:40:14 +0000 Subject: mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This functionality's sole user, the drm ttm module, removed support for it in commit 0d979509539e ("drm/ttm: remove ttm_bo_vm_insert_huge()") as the whole approach is currently unworkable without a PMD/PUD special bit and updates to GUP. Link: https://lkml.kernel.org/r/604c2ad79659d4b8a6e3e1611c6219d5d3233988.1678661628.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Thomas Hellström Cc: Aaron Tomlin Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Heiko Carstens Cc: Huacai Chen Cc: Marcelo Tosatti Cc: Peter Xu Cc: "Russell King (Oracle)" Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 39 ++------------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9a3a3af2dd80..20284387b841 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,44 +39,9 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); -vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, - pgprot_t pgprot, bool write); -/** - * vmf_insert_pfn_pmd - insert a pmd size pfn - * @vmf: Structure describing the fault - * @pfn: pfn to insert - * @pgprot: page protection to use - * @write: whether it's a write fault - * - * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. 
- * - * Return: vm_fault_t value. - */ -static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, - bool write) -{ - return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write); -} -vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, - pgprot_t pgprot, bool write); - -/** - * vmf_insert_pfn_pud - insert a pud size pfn - * @vmf: Structure describing the fault - * @pfn: pfn to insert - * @pgprot: page protection to use - * @write: whether it's a write fault - * - * Insert a pud size pfn. See vmf_insert_pfn() for additional info. - * - * Return: vm_fault_t value. - */ -static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, - bool write) -{ - return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write); -} +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, -- cgit v1.2.3 From 3f6dac0fd1b83178137e7b4e722d8f29612cbec1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 21 Mar 2023 03:24:15 +0300 Subject: mm/page_alloc: make deferred page init free pages in MAX_ORDER blocks Normal page init path frees pages during the boot in MAX_ORDER chunks, but deferred page init path does it in pageblock blocks. Change deferred page init path to work in MAX_ORDER blocks. For cases when MAX_ORDER is larger than pageblock, set migrate type to MIGRATE_MOVABLE for all pageblocks covered by the page. Link: https://lkml.kernel.org/r/20230321002415.20843-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2d3d78d01283..2d22e47dc1eb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -32,6 +32,8 @@ #endif #define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) +#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should -- cgit v1.2.3 From 4f80818b4a58c9458dce0df7cce9abe107da445e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 18:57:03 +0000 Subject: iov_iter: add copy_page_to_iter_nofault() Provide a means to copy a page to user space from an iterator, aborting if a page fault would occur. This supports compound pages, but may be passed a tail page with an offset extending further into the compound page, so we cannot pass a folio. This allows for this function to be called from atomic context and _try_ to user pages if they are faulted in, aborting if not. The function does not use _copy_to_iter() in order to not specify might_fault(), this is similar to copy_page_from_iter_atomic(). This is being added in order that an iteratable form of vread() can be implemented while holding spinlocks. 
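As a rough idea of a caller (a sketch only, not taken from fs/proc/kcore.c; the page is assumed to be valid and held by the caller):

    #include <linux/uio.h>
    #include <linux/mm.h>

    /* Sketch: copy one page into a user iterator without taking page faults. */
    static size_t copy_one_page_nofault(struct page *page, struct iov_iter *iter)
    {
            size_t copied;

            /* Safe while holding spinlocks: a needed fault aborts the copy instead. */
            copied = copy_page_to_iter_nofault(page, 0, PAGE_SIZE, iter);
            /*
             * A short copy means the user buffer needed a fault; the caller must
             * drop its locks, fault the buffer in and retry the remainder.
             */
            return copied;
    }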
Link: https://lkml.kernel.org/r/19734729defb0f498a76bdec1bef3ac48a3af3e8.1679511146.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Baoquan He Cc: Alexander Viro Cc: David Hildenbrand Cc: Jens Axboe Cc: Jiri Olsa Cc: Liu Shixin Cc: Matthew Wilcox (Oracle) Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- include/linux/uio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 27e3fd942960..29eb18bb6feb 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -173,6 +173,8 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, { return copy_page_to_iter(&folio->page, offset, bytes, i); } +size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, + size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) -- cgit v1.2.3 From 4c91c07c93bbbdd7f2d9de2beb7ee5c2a48ad8e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 18:57:04 +0000 Subject: mm: vmalloc: convert vread() to vread_iter() Having previously laid the foundation for converting vread() to an iterator function, pull the trigger and do so. This patch attempts to provide minimal refactoring and to reflect the existing logic as best we can, for example we continue to zero portions of memory not read, as before. Overall, there should be no functional difference other than a performance improvement in /proc/kcore access to vmalloc regions. Now we have eliminated the need for a bounce buffer in read_kcore_iter(), we dispense with it, and try to write to user memory optimistically but with faults disabled via copy_page_to_iter_nofault(). We already have preemption disabled by holding a spin lock. We continue faulting in until the operation is complete. Additionally, we must account for the fact that at any point a copy may fail (most likely due to a fault not being able to occur), we exit indicating fewer bytes retrieved than expected. 
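Schematically, the retry pattern described above looks something like this (a sketch under the assumption that fault_in_iov_iter_writeable() returns non-zero when the buffer cannot be faulted in; the real logic lives in fs/proc/kcore.c and mm/vmalloc.c):

    /* Sketch: read a vmalloc range into a user iterator, retrying after faults. */
    static size_t kcore_vread_sketch(struct iov_iter *iter, const char *addr, size_t count)
    {
            size_t done = 0;

            do {
                    long ret = vread_iter(iter, addr + done, count - done);

                    if (ret <= 0)
                            break;          /* nothing more could be copied */
                    done += ret;
                    /* Short copy: fault the user buffer in and go around again. */
            } while (done < count && !fault_in_iov_iter_writeable(iter, count - done));

            return done;
    }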
[sfr@canb.auug.org.au: fix sparc64 warning] Link: https://lkml.kernel.org/r/20230320144721.663280c3@canb.auug.org.au [lstoakes@gmail.com: redo Stephen's sparc build fix] Link: https://lkml.kernel.org/r/8506cbc667c39205e65a323f750ff9c11a463798.1679566220.git.lstoakes@gmail.com [akpm@linux-foundation.org: unbreak uio.h includes] Link: https://lkml.kernel.org/r/941f88bc5ab928e6656e1e2593b91bf0f8c81e1b.1679511146.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Signed-off-by: Stephen Rothwell Reviewed-by: Baoquan He Cc: Alexander Viro Cc: David Hildenbrand Cc: Jens Axboe Cc: Jiri Olsa Cc: Liu Shixin Cc: Matthew Wilcox (Oracle) Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 351fc7697214..c720be70c8dd 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -14,6 +14,7 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ struct notifier_block; /* in notifier.h */ +struct iov_iter; /* in uio.h */ /* bits in flags of vmalloc's vm_struct below */ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ @@ -247,7 +248,7 @@ static inline void set_vm_flush_reset_perms(void *addr) #endif /* for /proc/kcore */ -extern long vread(char *buf, char *addr, unsigned long count); +extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. -- cgit v1.2.3 From 20cce633f4254cc0df39665449726e3172518f6c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 27 Feb 2023 09:36:09 -0800 Subject: mm: rcu safe VMA freeing This prepares for page faults handling under VMA lock, looking up VMAs under protection of an rcu read lock, instead of the usual mmap read lock. Link: https://lkml.kernel.org/r/20230227173632.3292573-11-surenb@google.com Signed-off-by: Michel Lespinasse Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 651a5c256732..f203e2562e00 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -480,9 +480,16 @@ struct anon_vma_name { struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ - unsigned long vm_start; /* Our start address within vm_mm. */ - unsigned long vm_end; /* The first byte after our end address - within vm_mm. */ + union { + struct { + /* VMA covers [vm_start; vm_end) addresses within mm */ + unsigned long vm_start; + unsigned long vm_end; + }; +#ifdef CONFIG_PER_VMA_LOCK + struct rcu_head vm_rcu; /* Used for deferred freeing. */ +#endif + }; struct mm_struct *vm_mm; /* The address space we belong to. */ pgprot_t vm_page_prot; /* Access permissions of this VMA. */ -- cgit v1.2.3 From 438b6e12cd603a9416ca7c5462ab75b2a8c4d5b9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:10 -0800 Subject: mm: move mmap_lock assert function definitions Move mmap_lock assert function definitions up so that they can be used by other mmap_lock routines. 
Link: https://lkml.kernel.org/r/20230227173632.3292573-12-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 96e113e23d04..e49ba91bb1f0 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -60,6 +60,18 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) #endif /* CONFIG_TRACING */ +static inline void mmap_assert_locked(struct mm_struct *mm) +{ + lockdep_assert_held(&mm->mmap_lock); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +} + +static inline void mmap_assert_write_locked(struct mm_struct *mm) +{ + lockdep_assert_held_write(&mm->mmap_lock); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +} + static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); @@ -150,18 +162,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) up_read_non_owner(&mm->mmap_lock); } -static inline void mmap_assert_locked(struct mm_struct *mm) -{ - lockdep_assert_held(&mm->mmap_lock); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); -} - -static inline void mmap_assert_write_locked(struct mm_struct *mm) -{ - lockdep_assert_held_write(&mm->mmap_lock); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); -} - static inline int mmap_lock_is_contended(struct mm_struct *mm) { return rwsem_is_contended(&mm->mmap_lock); -- cgit v1.2.3 From 5e31275cc997f8ec5d9e8d65fe9840ebed89db19 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:11 -0800 Subject: mm: add per-VMA lock and helper functions to control it Introduce per-VMA locking. The lock implementation relies on a per-vma and per-mm sequence counters to note exclusive locking: - read lock - (implemented by vma_start_read) requires the vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ. If they match then there must be a vma exclusive lock held somewhere. - read unlock - (implemented by vma_end_read) is a trivial vma->lock unlock. - write lock - (vma_start_write) requires the mmap_lock to be held exclusively and the current mm counter is assigned to the vma counter. This will allow multiple vmas to be locked under a single mmap_lock write lock (e.g. during vma merging). The vma counter is modified under exclusive vma lock. - write unlock - (vma_end_write_all) is a batch release of all vma locks held. It doesn't pair with a specific vma_start_write! It is done before exclusive mmap_lock is released by incrementing mm sequence counter (mm_lock_seq). - write downgrade - if the mmap_lock is downgraded to the read lock, all vma write locks are released as well (effectivelly same as write unlock). 
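A minimal sketch of the intended read-side usage in a page fault path (illustrative only; a real fault handler also has to re-validate the VMA and handle retries, which is omitted here):

    /* Sketch: take the per-VMA read lock; on failure fall back to mmap_lock. */
    static vm_fault_t fault_under_vma_lock(struct vm_area_struct *vma,
                                           unsigned long address, unsigned int flags)
    {
            vm_fault_t ret;

            if (!vma_start_read(vma))
                    return VM_FAULT_RETRY; /* write-locked: caller takes the mmap_lock path */

            ret = handle_mm_fault(vma, address, flags, NULL);
            vma_end_read(vma);
            return ret;
    }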
Link: https://lkml.kernel.org/r/20230227173632.3292573-13-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 82 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 8 +++++ include/linux/mmap_lock.h | 13 ++++++++ 3 files changed, 103 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 55d02356146c..53bfb42663ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -624,6 +624,87 @@ struct vm_operations_struct { unsigned long addr); }; +#ifdef CONFIG_PER_VMA_LOCK +static inline void vma_init_lock(struct vm_area_struct *vma) +{ + init_rwsem(&vma->lock); + vma->vm_lock_seq = -1; +} + +/* + * Try to read-lock a vma. The function is allowed to occasionally yield false + * locked result to avoid performance overhead, in which case we fall back to + * using mmap_lock. The function should never yield false unlocked result. + */ +static inline bool vma_start_read(struct vm_area_struct *vma) +{ + /* Check before locking. A race might cause false locked result. */ + if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) + return false; + + if (unlikely(down_read_trylock(&vma->lock) == 0)) + return false; + + /* + * Overflow might produce false locked result. + * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq + * modification invalidates all existing locks. + */ + if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { + up_read(&vma->lock); + return false; + } + return true; +} + +static inline void vma_end_read(struct vm_area_struct *vma) +{ + rcu_read_lock(); /* keeps vma alive till the end of up_read */ + up_read(&vma->lock); + rcu_read_unlock(); +} + +static inline void vma_start_write(struct vm_area_struct *vma) +{ + int mm_lock_seq; + + mmap_assert_write_locked(vma->vm_mm); + + /* + * current task is holding mmap_write_lock, both vma->vm_lock_seq and + * mm->mm_lock_seq can't be concurrently modified. + */ + mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); + if (vma->vm_lock_seq == mm_lock_seq) + return; + + down_write(&vma->lock); + vma->vm_lock_seq = mm_lock_seq; + up_write(&vma->lock); +} + +static inline void vma_assert_write_locked(struct vm_area_struct *vma) +{ + mmap_assert_write_locked(vma->vm_mm); + /* + * current task is holding mmap_write_lock, both vma->vm_lock_seq and + * mm->mm_lock_seq can't be concurrently modified. 
+ */ + VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma); +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline void vma_init_lock(struct vm_area_struct *vma) {} +static inline bool vma_start_read(struct vm_area_struct *vma) + { return false; } +static inline void vma_end_read(struct vm_area_struct *vma) {} +static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} + +#endif /* CONFIG_PER_VMA_LOCK */ + static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; @@ -632,6 +713,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); + vma_init_lock(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f203e2562e00..843a45893991 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -503,6 +503,11 @@ struct vm_area_struct { vm_flags_t __private __vm_flags; }; +#ifdef CONFIG_PER_VMA_LOCK + int vm_lock_seq; + struct rw_semaphore lock; +#endif + /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. @@ -639,6 +644,9 @@ struct mm_struct { * init_mm.mmlist, and are protected * by mmlist_lock */ +#ifdef CONFIG_PER_VMA_LOCK + int mm_lock_seq; +#endif unsigned long hiwater_rss; /* High-watermark of RSS usage */ diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index e49ba91bb1f0..aab8f1b28d26 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); } +#ifdef CONFIG_PER_VMA_LOCK +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + /* No races during update due to exclusive mmap_lock being held */ + WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); +} +#else +static inline void vma_end_write_all(struct mm_struct *mm) {} +#endif + static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); @@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); + vma_end_write_all(mm); up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { __mmap_lock_trace_acquire_returned(mm, false, true); + vma_end_write_all(mm); downgrade_write(&mm->mmap_lock); } -- cgit v1.2.3 From c732293331a22187d59cd485879276a1da398387 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:12 -0800 Subject: mm: mark VMA as being written when changing vm_flags Updates to vm_flags have to be done with VMA marked as being written for preventing concurrent page faults or other modifications. 
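For example, a typical driver ->mmap handler keeps calling the same helper; the write-marking now happens inside it (a sketch with a made-up driver; example_vm_ops is hypothetical):

    /* Sketch: vm_flags_set() now write-locks the VMA before touching vm_flags. */
    static int example_mmap(struct file *file, struct vm_area_struct *vma)
    {
            vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP);
            vma->vm_ops = &example_vm_ops;  /* hypothetical vm_operations_struct */
            return 0;
    }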
Link: https://lkml.kernel.org/r/20230227173632.3292573-14-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 53bfb42663ea..d0a6c99aba09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -727,28 +727,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma, static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { - mmap_assert_write_locked(vma->vm_mm); + vma_start_write(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { - mmap_assert_write_locked(vma->vm_mm); + vma_start_write(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { - mmap_assert_write_locked(vma->vm_mm); + vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) |= flags; } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { - mmap_assert_write_locked(vma->vm_mm); + vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } @@ -769,7 +769,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma, static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { - mmap_assert_write_locked(vma->vm_mm); + vma_start_write(vma); __vm_flags_mod(vma, set, clear); } -- cgit v1.2.3 From 55fd6fccad3172c0feaaa817f0a1283629ff183e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:14 -0800 Subject: mm/khugepaged: write-lock VMA while collapsing a huge page Protect VMA from concurrent page fault handler while collapsing a huge page. Page fault handler needs a stable PMD to use PTL and relies on per-VMA lock to prevent concurrent PMD changes. pmdp_collapse_flush(), set_huge_pmd() and collapse_and_free_pmd() can modify a PMD, which will not be detected by a page fault handler without proper locking. Before this patch, page tables can be walked under any one of the mmap_lock, the mapping lock, and the anon_vma lock; so when khugepaged unlinks and frees page tables, it must ensure that all of those either are locked or don't exist. This patch adds a fourth lock under which page tables can be traversed, and so khugepaged must also lock out that one. 
[surenb@google.com: vm_lock/i_mmap_rwsem inversion in retract_page_tables] Link: https://lkml.kernel.org/r/20230303213250.3555716-1-surenb@google.com [surenb@google.com: build fix] Link: https://lkml.kernel.org/r/CAJuCfpFjWhtzRE1X=J+_JjgJzNKhq-=JT8yTBSTHthwp0pqWZw@mail.gmail.com Link: https://lkml.kernel.org/r/20230227173632.3292573-16-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d0a6c99aba09..d6a2abc51e3d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -665,18 +665,23 @@ static inline void vma_end_read(struct vm_area_struct *vma) rcu_read_unlock(); } -static inline void vma_start_write(struct vm_area_struct *vma) +static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) { - int mm_lock_seq; - mmap_assert_write_locked(vma->vm_mm); /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); - if (vma->vm_lock_seq == mm_lock_seq) + *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); + return (vma->vm_lock_seq == *mm_lock_seq); +} + +static inline void vma_start_write(struct vm_area_struct *vma) +{ + int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) return; down_write(&vma->lock); @@ -684,14 +689,26 @@ static inline void vma_start_write(struct vm_area_struct *vma) up_write(&vma->lock); } +static inline bool vma_try_start_write(struct vm_area_struct *vma) +{ + int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) + return true; + + if (!down_write_trylock(&vma->vm_lock->lock)) + return false; + + vma->vm_lock_seq = mm_lock_seq; + up_write(&vma->vm_lock->lock); + return true; +} + static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - mmap_assert_write_locked(vma->vm_mm); - /* - * current task is holding mmap_write_lock, both vma->vm_lock_seq and - * mm->mm_lock_seq can't be concurrently modified. - */ - VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma); + int mm_lock_seq; + + VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } #else /* CONFIG_PER_VMA_LOCK */ @@ -701,6 +718,8 @@ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline bool vma_try_start_write(struct vm_area_struct *vma) + { return true; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} #endif /* CONFIG_PER_VMA_LOCK */ -- cgit v1.2.3 From 457f67be5910a2b5f1fda8af06bfe4d3492a0a4f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:21 -0800 Subject: mm: introduce vma detached flag Per-vma locking mechanism will search for VMA under RCU protection and then after locking it, has to ensure it was not removed from the VMA tree after we found it. To make this check efficient, introduce a vma->detached flag to mark VMAs which were removed from the VMA tree. 
Link: https://lkml.kernel.org/r/20230227173632.3292573-23-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 +++++++++++ include/linux/mm_types.h | 3 +++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d6a2abc51e3d..d0f289bfef01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -711,6 +711,14 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } +static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +{ + /* When detaching vma should be write-locked */ + if (detached) + vma_assert_write_locked(vma); + vma->detached = detached; +} + #else /* CONFIG_PER_VMA_LOCK */ static inline void vma_init_lock(struct vm_area_struct *vma) {} @@ -721,6 +729,8 @@ static inline void vma_start_write(struct vm_area_struct *vma) {} static inline bool vma_try_start_write(struct vm_area_struct *vma) { return true; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma, + bool detached) {} #endif /* CONFIG_PER_VMA_LOCK */ @@ -732,6 +742,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); + vma_mark_detached(vma, false); vma_init_lock(vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 843a45893991..3248ae45cb2e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -506,6 +506,9 @@ struct vm_area_struct { #ifdef CONFIG_PER_VMA_LOCK int vm_lock_seq; struct rw_semaphore lock; + + /* Flag to indicate areas detached from the mm->mm_mt tree */ + bool detached; #endif /* -- cgit v1.2.3 From 50ee32537206140e4cf6e47024be29a84d458d49 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:22 -0800 Subject: mm: introduce lock_vma_under_rcu to be used from arch-specific code Introduce lock_vma_under_rcu function to lookup and lock a VMA during page fault handling. When VMA is not found, can't be locked or changes after being locked, the function returns NULL. The lookup is performed under RCU protection to prevent the found VMA from being destroyed before the VMA lock is acquired. VMA lock statistics are updated according to the results. For now only anonymous VMAs can be searched this way. In other cases the function returns NULL. Link: https://lkml.kernel.org/r/20230227173632.3292573-24-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d0f289bfef01..0619f9fa7581 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -719,6 +719,9 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) vma->detached = detached; } +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address); + #else /* CONFIG_PER_VMA_LOCK */ static inline void vma_init_lock(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 55324e46eb8b886ecd08c9a089d3a694c705b3b0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:24 -0800 Subject: mm: add FAULT_FLAG_VMA_LOCK flag Add a new flag to distinguish page faults handled under protection of per-vma lock. 
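A hedged sketch of the arch-side pattern these two additions enable; the wrapper name try_per_vma_fault() is invented here, while lock_vma_under_rcu(), vma_end_read() and FAULT_FLAG_VMA_LOCK come from this series.

static vm_fault_t try_per_vma_fault(struct mm_struct *mm, unsigned long address,
                                    unsigned int flags, struct pt_regs *regs)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        vma = lock_vma_under_rcu(mm, address);
        if (!vma)
                return VM_FAULT_RETRY;          /* caller falls back to the mmap_lock path */

        fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                vma_end_read(vma);              /* already dropped on retry/completed paths */

        return fault;                           /* VM_FAULT_RETRY: retry under mmap_lock */
}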
[surenb@google.com: document FAULT_FLAG_VMA_LOCK flag] Link: https://lkml.kernel.org/r/20230301022720.1380780-2-surenb@google.com Link: https://lore.kernel.org/all/20230301113648.7c279865@canb.auug.org.au/ Link: https://lkml.kernel.org/r/20230227173632.3292573-26-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Laurent Dufour Cc: Dan Carpenter Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Liam R. Howlett Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Pavel Tatashin Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- include/linux/mm_types.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0619f9fa7581..1876825d6700 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -479,7 +479,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ - { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ + { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3248ae45cb2e..3a6cf4c2edd0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1049,6 +1049,7 @@ typedef struct { * mapped after the fault. * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. * We should only access orig_pte if this flag set. + * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -1086,6 +1087,7 @@ enum fault_flag { FAULT_FLAG_INTERRUPTIBLE = 1 << 9, FAULT_FLAG_UNSHARE = 1 << 10, FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, + FAULT_FLAG_VMA_LOCK = 1 << 12, }; typedef unsigned int __bitwise zap_flags_t; -- cgit v1.2.3 From 52f238653e452e0fda61e880f263a173d219acd1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:27 -0800 Subject: mm: introduce per-VMA lock statistics Add a new CONFIG_PER_VMA_LOCK_STATS config option to dump extra statistics about handling page fault under VMA lock. 
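A heavily trimmed illustration of where these counters could be bumped in the RCU lookup path, using the count_vm_vma_lock_event() helper and event names added by the hunks that follow; the real lock_vma_under_rcu() also restricts itself to anonymous VMAs and re-checks the address range.

        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

        rcu_read_lock();
        vma = mas_walk(&mas);                           /* RCU-safe maple tree walk */
        if (!vma || !vma_start_read(vma)) {
                count_vm_vma_lock_event(VMA_LOCK_ABORT);
                vma = NULL;
        } else if (READ_ONCE(vma->detached)) {
                vma_end_read(vma);
                count_vm_vma_lock_event(VMA_LOCK_MISS); /* raced with VMA removal */
                vma = NULL;
        }
        rcu_read_unlock();
        /* VMA_LOCK_SUCCESS / VMA_LOCK_RETRY are expected to be counted by the arch fault handler */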
Link: https://lkml.kernel.org/r/20230227173632.3292573-29-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 6 ++++++ include/linux/vmstat.h | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 7f5d1caf5890..8abfa1240040 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, +#endif +#ifdef CONFIG_PER_VMA_LOCK_STATS + VMA_LOCK_SUCCESS, + VMA_LOCK_ABORT, + VMA_LOCK_RETRY, + VMA_LOCK_MISS, #endif NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 19cf5b6892ce..fed855bae6d8 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif +#ifdef CONFIG_PER_VMA_LOCK_STATS +#define count_vm_vma_lock_event(x) count_vm_event(x) +#else +#define count_vm_vma_lock_event(x) do {} while (0) +#endif + #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) -- cgit v1.2.3 From 0d2ebf9c3f7822e7ba3e4792ea3b6b19aa2da34a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:31 -0800 Subject: mm/mmap: free vm_area_struct without call_rcu in exit_mmap call_rcu() can take a long time when callback offloading is enabled. Its use in the vm_area_free can cause regressions in the exit path when multiple VMAs are being freed. Because exit_mmap() is called only after the last mm user drops its refcount, the page fault handlers can't be racing with it. Any other possible user like oom-reaper or process_mrelease are already synchronized using mmap_lock. Therefore exit_mmap() can free VMAs directly, without the use of call_rcu(). Expose __vm_area_free() and use it from exit_mmap() to avoid possible call_rcu() floods and performance regressions caused by it. Link: https://lkml.kernel.org/r/20230227173632.3292573-33-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1876825d6700..f31c75dc190e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -257,6 +257,8 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); +/* Use only if VMA has no other users */ +void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; -- cgit v1.2.3 From c7f8f31c00d187a2c71a241c7f2bd6aa102a4e6f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:32 -0800 Subject: mm: separate vma->lock from vm_area_struct vma->lock being part of the vm_area_struct causes performance regression during page faults because during contention its count and owner fields are constantly updated and having other parts of vm_area_struct used during page fault handling next to them causes constant cache line bouncing. Fix that by moving the lock outside of the vm_area_struct. 
All attempts to keep vma->lock inside vm_area_struct in a separate cache line still produce performance regression especially on NUMA machines. Smallest regression was achieved when lock is placed in the fourth cache line but that bloats vm_area_struct to 256 bytes. Considering performance and memory impact, separate lock looks like the best option. It increases memory footprint of each VMA but that can be optimized later if the new size causes issues. Note that after this change vma_init() does not allocate or initialize vma->lock anymore. A number of drivers allocate a pseudo VMA on the stack but they never use the VMA's lock, therefore it does not need to be allocated. The future drivers which might need the VMA lock should use vm_area_alloc()/vm_area_free() to allocate the VMA. Link: https://lkml.kernel.org/r/20230227173632.3292573-34-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 23 ++++++++++------------- include/linux/mm_types.h | 6 +++++- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f31c75dc190e..f5487a86b154 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -628,12 +628,6 @@ struct vm_operations_struct { }; #ifdef CONFIG_PER_VMA_LOCK -static inline void vma_init_lock(struct vm_area_struct *vma) -{ - init_rwsem(&vma->lock); - vma->vm_lock_seq = -1; -} - /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to @@ -645,17 +639,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma) if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) return false; - if (unlikely(down_read_trylock(&vma->lock) == 0)) + if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) return false; /* * Overflow might produce false locked result. * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq + * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq * modification invalidates all existing locks. */ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { - up_read(&vma->lock); + up_read(&vma->vm_lock->lock); return false; } return true; @@ -664,7 +658,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->lock); + up_read(&vma->vm_lock->lock); rcu_read_unlock(); } @@ -687,9 +681,9 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->lock); + down_write(&vma->vm_lock->lock); vma->vm_lock_seq = mm_lock_seq; - up_write(&vma->lock); + up_write(&vma->vm_lock->lock); } static inline bool vma_try_start_write(struct vm_area_struct *vma) @@ -740,6 +734,10 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, #endif /* CONFIG_PER_VMA_LOCK */ +/* + * WARNING: vma_init does not initialize vma->vm_lock. + * Use vm_area_alloc()/vm_area_free() if vma needs locking. 
+ */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; @@ -749,7 +747,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); - vma_init_lock(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a6cf4c2edd0..1b3031e95215 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -471,6 +471,10 @@ struct anon_vma_name { char name[]; }; +struct vma_lock { + struct rw_semaphore lock; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -505,7 +509,7 @@ struct vm_area_struct { #ifdef CONFIG_PER_VMA_LOCK int vm_lock_seq; - struct rw_semaphore lock; + struct vma_lock *vm_lock; /* Flag to indicate areas detached from the mm->mm_mt tree */ bool detached; -- cgit v1.2.3 From ef6a22b70f6d90449a5c797b8968a682824e2011 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 1 Mar 2023 17:49:00 +0530 Subject: sched/numa: apply the scan delay to every new vma Pach series "sched/numa: Enhance vma scanning", v3. The patchset proposes one of the enhancements to numa vma scanning suggested by Mel. This is continuation of [3]. Reposting the rebased patchset to akpm mm-unstable tree (March 1) Existing mechanism of scan period involves, scan period derived from per-thread stats. Process Adaptive autoNUMA [1] proposed to gather NUMA fault stats at per-process level to capture aplication behaviour better. During that course of discussion, Mel proposed several ideas to enhance current numa balancing. One of the suggestion was below Track what threads access a VMA. The suggestion was to use an unsigned long pid_mask and use the lower bits to tag approximately what threads access a VMA. Skip VMAs that did not trap a fault. This would be approximate because of PID collisions but would reduce scanning of areas the thread is not interested in. The above suggestion intends not to penalize threads that has no interest in the vma, thus reduce scanning overhead. V3 changes are mostly based on PeterZ comments (details below in changes) Summary of patchset: Current patchset implements: 1. Delay the vma scanning logic for newly created VMA's so that additional overhead of scanning is not incurred for short lived tasks (implementation by Mel) 2. Store the information of tasks accessing VMA in 2 windows. It is regularly cleared in (4*sysctl_numa_balancing_scan_delay) interval. The above time is derived from experimenting (Suggested by PeterZ) to balance between frequent clearing vs obsolete access data 3. hash_32 used to encode task index accessing VMA information 4. 
VMA's acess information is used to skip scanning for the tasks which had not accessed VMA Changes since V2: patch1: - Renaming of structure, macro to function, - Add explanation to heuristics - Adding more details from result (PeterZ) Patch2: - Usage of test and set bit (PeterZ) - Move storing access PID info to numa_migrate_prep() - Add a note on fainess among tasks allowed to scan (PeterZ) Patch3: - Maintain two windows of access PID information (PeterZ supported implementation and Gave idea to extend to N if needed) Patch4: - Apply hash_32 function to track VMA accessing PIDs (PeterZ) Changes since RFC V1: - Include Mel's vma scan delay patch - Change the accessing pid store logic (Thanks Mel) - Fencing structure / code to NUMA_BALANCING (David, Mel) - Adding clearing access PID logic (Mel) - Descriptive change log ( Mike Rapoport) Things to ponder over: ========================================== - Improvement to clearing accessing PIDs logic (discussed in-detail in patch3 itself (Done in this patchset by implementing 2 window history) - Current scan period is not changed in the patchset, so we do see frequent tries to scan. Relaxing scan period dynamically could improve results further. [1] sched/numa: Process Adaptive autoNUMA Link: https://lore.kernel.org/lkml/20220128052851.17162-1-bharata@amd.com/T/ [2] RFC V1 Link: https://lore.kernel.org/all/cover.1673610485.git.raghavendra.kt@amd.com/ [3] V2 Link: https://lore.kernel.org/lkml/cover.1675159422.git.raghavendra.kt@amd.com/ Results: Summary: Huge autonuma cost reduction seen in mmtest. Kernbench improvement is more than 5% and huge system time (80%+) improvement from mmtest autonuma. (dbench had huge std deviation to post) kernbench =========== 6.2.0-mmunstable-base 6.2.0-mmunstable-patched Amean user-256 22002.51 ( 0.00%) 22649.95 * -2.94%* Amean syst-256 10162.78 ( 0.00%) 8214.13 * 19.17%* Amean elsp-256 160.74 ( 0.00%) 156.92 * 2.38%* Duration User 66017.43 67959.84 Duration System 30503.15 24657.03 Duration Elapsed 504.61 493.12 6.2.0-mmunstable-base 6.2.0-mmunstable-patched Ops NUMA alloc hit 1738835089.00 1738780310.00 Ops NUMA alloc local 1738834448.00 1738779711.00 Ops NUMA base-page range updates 477310.00 392566.00 Ops NUMA PTE updates 477310.00 392566.00 Ops NUMA hint faults 96817.00 87555.00 Ops NUMA hint local faults % 10150.00 2192.00 Ops NUMA hint local percent 10.48 2.50 Ops NUMA pages migrated 86660.00 85363.00 Ops AutoNUMA cost 489.07 442.14 autonumabench =============== 6.2.0-mmunstable-base 6.2.0-mmunstable-patched Amean syst-NUMA01 399.50 ( 0.00%) 52.05 * 86.97%* Amean syst-NUMA01_THREADLOCAL 0.21 ( 0.00%) 0.22 * -5.41%* Amean syst-NUMA02 0.80 ( 0.00%) 0.78 * 2.68%* Amean syst-NUMA02_SMT 0.65 ( 0.00%) 0.68 * -3.95%* Amean elsp-NUMA01 313.26 ( 0.00%) 313.11 * 0.05%* Amean elsp-NUMA01_THREADLOCAL 1.06 ( 0.00%) 1.08 * -1.76%* Amean elsp-NUMA02 3.19 ( 0.00%) 3.24 * -1.52%* Amean elsp-NUMA02_SMT 3.72 ( 0.00%) 3.61 * 2.92%* Duration User 396433.47 324835.96 Duration System 2808.70 376.66 Duration Elapsed 2258.61 2258.12 6.2.0-mmunstable-base 6.2.0-mmunstable-patched Ops NUMA alloc hit 59921806.00 49623489.00 Ops NUMA alloc miss 0.00 0.00 Ops NUMA interleave hit 0.00 0.00 Ops NUMA alloc local 59920880.00 49622594.00 Ops NUMA base-page range updates 152259275.00 50075.00 Ops NUMA PTE updates 152259275.00 50075.00 Ops NUMA PMD updates 0.00 0.00 Ops NUMA hint faults 154660352.00 39014.00 Ops NUMA hint local faults % 138550501.00 23139.00 Ops NUMA hint local percent 89.58 59.31 Ops NUMA pages migrated 8179067.00 14147.00 
Ops AutoNUMA cost 774522.98 195.69 This patch (of 4): Currently whenever a new task is created we wait for sysctl_numa_balancing_scan_delay to avoid unnessary scanning overhead. Extend the same logic to new or very short-lived VMAs. [raghavendra.kt@amd.com: add initialization in vm_area_dup())] Link: https://lkml.kernel.org/r/cover.1677672277.git.raghavendra.kt@amd.com Link: https://lkml.kernel.org/r/7a6fbba87c8b51e67efd3e74285bb4cb311a16ca.1677672277.git.raghavendra.kt@amd.com Signed-off-by: Mel Gorman Signed-off-by: Raghavendra K T Cc: Bharata B Rao Cc: David Hildenbrand Cc: Ingo Molnar Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Disha Talreja Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++++ include/linux/mm_types.h | 7 +++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f5487a86b154..9f08a11b355c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -29,6 +29,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -627,6 +628,20 @@ struct vm_operations_struct { unsigned long addr); }; +#ifdef CONFIG_NUMA_BALANCING +static inline void vma_numab_state_init(struct vm_area_struct *vma) +{ + vma->numab_state = NULL; +} +static inline void vma_numab_state_free(struct vm_area_struct *vma) +{ + kfree(vma->numab_state); +} +#else +static inline void vma_numab_state_init(struct vm_area_struct *vma) {} +static inline void vma_numab_state_free(struct vm_area_struct *vma) {} +#endif /* CONFIG_NUMA_BALANCING */ + #ifdef CONFIG_PER_VMA_LOCK /* * Try to read-lock a vma. The function is allowed to occasionally yield false @@ -747,6 +762,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); + vma_numab_state_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1b3031e95215..3e1a42673769 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -475,6 +475,10 @@ struct vma_lock { struct rw_semaphore lock; }; +struct vma_numab_state { + unsigned long next_scan; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -560,6 +564,9 @@ struct vm_area_struct { #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif +#ifdef CONFIG_NUMA_BALANCING + struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; -- cgit v1.2.3 From fc137c0ddab29b591db6a091dc6d7ce20ccb73f2 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 1 Mar 2023 17:49:01 +0530 Subject: sched/numa: enhance vma scanning logic During Numa scanning make sure only relevant vmas of the tasks are scanned. Before: All the tasks of a process participate in scanning the vma even if they do not access vma in it's lifespan. Now: Except cases of first few unconditional scans, if a process do not touch vma (exluding false positive cases of PID collisions) tasks no longer scan all vma Logic used: 1) 6 bits of PID used to mark active bit in vma numab status during fault to remember PIDs accessing vma. (Thanks Mel) 2) Subsequently in scan path, vma scanning is skipped if current PID had not accessed vma. 
3) First two times we do allow unconditional scan to preserve earlier behaviour of scanning. Acknowledgement to Bharata B Rao for initial patch to store pid information and Peter Zijlstra (Usage of test and set bit) Link: https://lkml.kernel.org/r/092f03105c7c1d3450f4636b1ea350407f07640e.1677672277.git.raghavendra.kt@amd.com Signed-off-by: Raghavendra K T Suggested-by: Mel Gorman Cc: David Hildenbrand Cc: Disha Talreja Cc: Ingo Molnar Cc: Mike Rapoport Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 ++++++++++++++ include/linux/mm_types.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9f08a11b355c..215327daffae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1686,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time) last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } + +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) +{ + unsigned int pid_bit; + + pid_bit = current->pid % BITS_PER_LONG; + if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids)) { + __set_bit(pid_bit, &vma->numab_state->access_pids); + } +} #else /* !CONFIG_NUMA_BALANCING */ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { @@ -1735,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } + +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) +{ +} #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e1a42673769..f8cbd8efc7cb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -477,6 +477,7 @@ struct vma_lock { struct vma_numab_state { unsigned long next_scan; + unsigned long access_pids; }; /* -- cgit v1.2.3 From 20f586486b87dcfe10b8c79398e24e720885588a Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 1 Mar 2023 17:49:02 +0530 Subject: sched/numa: implement access PID reset logic This helps to ensure that only recently accessed PIDs scan the VMAs. Current implementation: (idea supported by PeterZ) 1. Accessing PID information is maintained in two windows. access_pids[1] being newest. 2. Reset old access PID info i.e. access_pid[0] every (4 * sysctl_numa_balancing_scan_delay) interval after initial scan delay period expires. The above interval seemed to be experimentally optimum since it avoids frequent reset of access info as well as helps clearing the old access info regularly. The reset logic is implemented in scan path. 
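A hedged sketch of that reset, as it might look in the scan path (task_numa_work()); the interval expression and the elided surrounding code are assumptions, while the numab_state fields are the ones added by this series.

        /* Rotate the access-PID windows once the reset interval has elapsed */
        if (time_after(jiffies, vma->numab_state->next_pid_reset)) {
                vma->numab_state->next_pid_reset = jiffies +
                        msecs_to_jiffies(4 * sysctl_numa_balancing_scan_delay);

                /* The newest window becomes the old one; the newest starts empty */
                vma->numab_state->access_pids[0] =
                        READ_ONCE(vma->numab_state->access_pids[1]);
                vma->numab_state->access_pids[1] = 0;
        }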
Link: https://lkml.kernel.org/r/f7a675f66d1442d048b4216b2baf94515012c405.1677672277.git.raghavendra.kt@amd.com Signed-off-by: Raghavendra K T Suggested-by: Mel Gorman Cc: Bharata B Rao Cc: David Hildenbrand Cc: Disha Talreja Cc: Ingo Molnar Cc: Mike Rapoport Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- include/linux/mm_types.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 215327daffae..e05a878e186e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1692,8 +1692,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) unsigned int pid_bit; pid_bit = current->pid % BITS_PER_LONG; - if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids)) { - __set_bit(pid_bit, &vma->numab_state->access_pids); + if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) { + __set_bit(pid_bit, &vma->numab_state->access_pids[1]); } } #else /* !CONFIG_NUMA_BALANCING */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8cbd8efc7cb..092f842a854f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -477,7 +477,8 @@ struct vma_lock { struct vma_numab_state { unsigned long next_scan; - unsigned long access_pids; + unsigned long next_pid_reset; + unsigned long access_pids[2]; }; /* -- cgit v1.2.3 From d46031f40e0f7f7bf63914bb3f2e404ad3886ecd Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 1 Mar 2023 17:49:03 +0530 Subject: sched/numa: use hash_32 to mix up PIDs accessing VMA before: last 6 bits of PID is used as index to store information about tasks accessing VMA's. after: hash_32 is used to take of cases where tasks are created over a period of time, and thus improve collision probability. Result: The patch series overall improves autonuma cost. Kernbench around more than 5% improvement and system time in mmtest autonuma showed more than 80% improvement Link: https://lkml.kernel.org/r/d5a9f75513300caed74e5c8570bba9317b963c2b.1677672277.git.raghavendra.kt@amd.com Signed-off-by: Raghavendra K T Suggested-by: Peter Zijlstra Cc: Bharata B Rao Cc: David Hildenbrand Cc: Disha Talreja Cc: Ingo Molnar Cc: Mel Gorman Cc: Mike Rapoport Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e05a878e186e..e249208f8fbe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1691,7 +1691,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { unsigned int pid_bit; - pid_bit = current->pid % BITS_PER_LONG; + pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) { __set_bit(pid_bit, &vma->numab_state->access_pids[1]); } -- cgit v1.2.3 From 5f300fd59a2ae90b8a7fb5ed3d5fd43768236c38 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Apr 2023 10:03:53 +0200 Subject: mm: make arch_has_descending_max_zone_pfns() static clang produces a build failure on x86 for some randconfig builds after a change that moves around code to mm/mm_init.c: Cannot find symbol for section 2: .text. mm/mm_init.o: failed I have not been able to figure out why this happens, but the __weak annotation on arch_has_descending_max_zone_pfns() is the trigger here. 
Removing the weak function in favor of an open-coded Kconfig option check avoids the problem and becomes clearer as well as better to optimize by the compiler. [arnd@arndb.de: fix logic bug] Link: https://lkml.kernel.org/r/20230415081904.969049-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20230414080418.110236-1-arnd@kernel.org Fixes: 9420f89db2dd ("mm: move most of core MM initialization to mm/mm_init.c") Signed-off-by: Arnd Bergmann Acked-by: Vlastimil Babka Tested-by: SeongJae Park Tested-by: Geert Uytterhoeven Acked-by: Mike Rapoport (IBM) Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e249208f8fbe..5e5ef6ee4003 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3035,7 +3035,6 @@ extern void setup_per_cpu_pageset(void); extern int min_free_kbytes; extern int watermark_boost_factor; extern int watermark_scale_factor; -extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; -- cgit v1.2.3 From a85c2257a8ac353af16dbcbf32c50d3380860bc5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Mar 2023 14:44:47 +0100 Subject: sched/isolation: add cpu_is_isolated() API Patch series "memcg, cpuisol: do not interfere pcp cache charges draining with cpuisol workloads". Leonardo has reported [1] that pcp memcg charge draining can interfere with cpu isolated workloads. The said draining is done from a WQ context with a pcp worker scheduled on each CPU which holds any cached charges for a specific memcg hierarchy. Operation is not really a common operation [2]. It can be triggered from the userspace though so some care is definitely due. Leonardo has tried to address the issue by allowing remote charge draining [3]. This approach requires an additional locking to synchronize pcp caches sync from a remote cpu from local pcp consumers. Even though the proposed lock was per-cpu there is still potential for contention and less predictable behavior. This patchset addresses the issue from a different angle. Rather than dealing with a potential synchronization, cpus which are isolated are simply never scheduled to be drained. This means that a small amount of charges could be laying around and waiting for a later use or they are flushed when a different memcg is charged from the same cpu. More details are in patch 2. The first patch from Frederic is implementing an abstraction to tell whether a specific cpu has been isolated and therefore require a special treatment. This patch (of 2): Provide this new API to check if a CPU has been isolated either through isolcpus= or nohz_full= kernel parameter. It aims at avoiding kernel load deemed to be safely spared on CPUs running sensitive workload that can't bear any disturbance, such as pcp cache draining. 
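A hedged usage sketch (the memcg consumer lands in a separate patch that is not among the hunks shown here): work that would disturb an isolated CPU is simply not queued there.

static void queue_drain_work_on_housekeeping_cpus(struct work_struct __percpu *works)
{
        int cpu;

        for_each_online_cpu(cpu) {
                if (cpu_is_isolated(cpu))
                        continue;       /* isolcpus= / nohz_full= CPU: leave it alone */
                schedule_work_on(cpu, per_cpu_ptr(works, cpu));
        }
}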
Link: https://lkml.kernel.org/r/20230317134448.11082-1-mhocko@kernel.org Link: https://lkml.kernel.org/r/20230317134448.11082-2-mhocko@kernel.org Signed-off-by: Frederic Weisbecker Signed-off-by: Michal Hocko Suggested-by: Michal Hocko Cc: Johannes Weiner Cc: Marcelo Tosatti Cc: Muchun Song Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Leonardo Bras Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/sched/isolation.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 8c15abd67aed..fe1a46f30d24 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -46,6 +46,12 @@ static inline bool housekeeping_enabled(enum hk_type type) static inline void housekeeping_affine(struct task_struct *t, enum hk_type type) { } + +static inline bool housekeeping_test_cpu(int cpu, enum hk_type type) +{ + return true; +} + static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ @@ -58,4 +64,10 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type) return true; } +static inline bool cpu_is_isolated(int cpu) +{ + return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) || + !housekeeping_test_cpu(cpu, HK_TYPE_TICK); +} + #endif /* _LINUX_SCHED_ISOLATION_H */ -- cgit v1.2.3 From 957ebbdf434013ee01f29f6c9174eced995ebbe7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Mar 2023 16:10:50 +0100 Subject: hugetlb: remove PageHeadHuge() Sidhartha Kumar removed the last caller of PageHeadHuge(), so we can now remove it and make folio_test_hugetlb() the real implementation. Add kernel-doc for folio_test_hugetlb(). Link: https://lkml.kernel.org/r/20230327151050.1787744-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dcda20c47b8f..efc42fae3d96 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -815,14 +815,9 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); -int PageHeadHuge(struct page *page); -static inline bool folio_test_hugetlb(struct folio *folio) -{ - return PageHeadHuge(&folio->page); -} +bool folio_test_hugetlb(struct folio *folio); #else TESTPAGEFLAG_FALSE(Huge, hugetlb) -TESTPAGEFLAG_FALSE(HeadHuge, headhuge) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From 62f31bd4dcedffe3c919deb76ed65bf62c3cf80b Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 26 Mar 2023 19:02:15 +0300 Subject: mm: move free_area_empty() to mm/internal.h The free_area_empty() helper is only used inside mm/ so move it there to reduce noise in include/linux/mmzone.h Link: https://lkml.kernel.org/r/20230326160215.2674531-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2d22e47dc1eb..337956748976 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -110,11 +110,6 @@ struct free_area { unsigned long nr_free; 
}; -static inline bool free_area_empty(struct free_area *area, int migratetype) -{ - return list_empty(&area->free_list[migratetype]); -} - struct pglist_data; #ifdef CONFIG_NUMA -- cgit v1.2.3 From 8bff9a04ca33476213ea6155850505787c540c25 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 30 Mar 2023 19:17:54 +0000 Subject: cgroup: rename cgroup_rstat_flush_"irqsafe" to "atomic" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "memcg: avoid flushing stats atomically where possible", v3. rstat flushing is an expensive operation that scales with the number of cpus and the number of cgroups in the system. The purpose of this series is to minimize the contexts where we flush stats atomically. Patches 1 and 2 are cleanups requested during reviews of prior versions of this series. Patch 3 makes sure we never try to flush from within an irq context. Patches 4 to 7 introduce separate variants of mem_cgroup_flush_stats() for atomic and non-atomic flushing, and make sure we only flush the stats atomically when necessary. Patch 8 is a slightly tangential optimization that limits the work done by rstat flushing in some scenarios. This patch (of 8): cgroup_rstat_flush_irqsafe() can be a confusing name. It may read as "irqs are disabled throughout", which is what the current implementation does (currently under discussion [1]), but is not the intention. The intention is that this function is safe to call from atomic contexts. Name it as such. Link: https://lkml.kernel.org/r/20230330191801.1967435-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20230330191801.1967435-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Suggested-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Jens Axboe Cc: Josef Bacik Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Vasily Averin Cc: Zefan Li Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3410aecffdb4..885f5395fcd0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -692,7 +692,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); -void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); +void cgroup_rstat_flush_atomic(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); -- cgit v1.2.3 From 92fbbc7202ac4fb16250dc91c88211814ae2190b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 30 Mar 2023 19:17:55 +0000 Subject: memcg: rename mem_cgroup_flush_stats_"delayed" to "ratelimited" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mem_cgroup_flush_stats_delayed() suggests his is using a delayed_work, but this is actually sometimes flushing directly from the callsite. What it's doing is ratelimited calls. A better name would be mem_cgroup_flush_stats_ratelimited(). 
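A minimal sketch of the ratelimiting the new name describes, assuming a flush_next_time timestamp maintained by the periodic flusher; the real mm/memcontrol.c implementation differs in detail.

static u64 flush_next_time;     /* assumed bookkeeping, advanced by the flusher */

void mem_cgroup_flush_stats_ratelimited(void)
{
        /* Only fall through to a full (expensive) rstat flush occasionally */
        if (time_after64(get_jiffies_64(), READ_ONCE(flush_next_time)))
                mem_cgroup_flush_stats();
}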
Link: https://lkml.kernel.org/r/20230330191801.1967435-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Suggested-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Jens Axboe Cc: Josef Bacik Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Vasily Averin Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index aa69ea98e2d8..00a88cf947e1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1038,7 +1038,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_delayed(void); +void mem_cgroup_flush_stats_ratelimited(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1536,7 +1536,7 @@ static inline void mem_cgroup_flush_stats(void) { } -static inline void mem_cgroup_flush_stats_delayed(void) +static inline void mem_cgroup_flush_stats_ratelimited(void) { } -- cgit v1.2.3 From 9fad9aee1f267a8ad1f86b87ae70b2c4d6796164 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 30 Mar 2023 19:17:58 +0000 Subject: memcg: sleep during flushing stats in safe contexts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, all contexts that flush memcg stats do so with sleeping not allowed. Some of these contexts are perfectly safe to sleep in, such as reading cgroup files from userspace or the background periodic flusher. Flushing is an expensive operation that scales with the number of cpus and the number of cgroups in the system, so avoid doing it atomically where possible. Refactor the code to make mem_cgroup_flush_stats() non-atomic (aka sleepable), and provide a separate atomic version. The atomic version is used in reclaim, refault, writeback, and in mem_cgroup_usage(). All other code paths are left to use the non-atomic version. This includes callbacks for userspace reads and the periodic flusher. Since refault is the only caller of mem_cgroup_flush_stats_ratelimited(), change it to mem_cgroup_flush_stats_atomic_ratelimited(). Reclaim and refault code paths are modified to do non-atomic flushing in separate later patches -- so it will eventually be changed back to mem_cgroup_flush_stats_ratelimited(). 
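A hedged sketch of the resulting split; the do_flush_stats() helper and the use of the root cgroup here are illustrative assumptions, while the two entry points and cgroup_rstat_flush{,_atomic}() are from this series.

static void do_flush_stats(bool atomic)
{
        if (atomic)
                cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
        else
                cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
}

/* Sleepable variant: userspace reads, periodic flusher, ... */
void mem_cgroup_flush_stats(void)
{
        do_flush_stats(false);
}

/* Atomic variant: reclaim, refault, writeback, mem_cgroup_usage() */
void mem_cgroup_flush_stats_atomic(void)
{
        do_flush_stats(true);
}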
Link: https://lkml.kernel.org/r/20230330191801.1967435-6-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Jens Axboe Cc: Josef Bacik Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Vasily Averin Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 00a88cf947e1..3db355e6677f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1038,7 +1038,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_ratelimited(void); +void mem_cgroup_flush_stats_atomic(void); +void mem_cgroup_flush_stats_atomic_ratelimited(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1536,7 +1537,11 @@ static inline void mem_cgroup_flush_stats(void) { } -static inline void mem_cgroup_flush_stats_ratelimited(void) +static inline void mem_cgroup_flush_stats_atomic(void) +{ +} + +static inline void mem_cgroup_flush_stats_atomic_ratelimited(void) { } -- cgit v1.2.3 From 4009b2f1887036d30637bc06dd0ade7e18408bb3 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 30 Mar 2023 19:17:59 +0000 Subject: workingset: memcg: sleep when flushing stats in workingset_refault() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In workingset_refault(), we call mem_cgroup_flush_stats_atomic_ratelimited() to read accurate stats within an RCU read section and with sleeping disallowed. Move the call above the RCU read section to make it non-atomic. Flushing is an expensive operation that scales with the number of cpus and the number of cgroups in the system, so avoid doing it atomically where possible. Since workingset_refault() is the only caller of mem_cgroup_flush_stats_atomic_ratelimited(), just make it non-atomic, and rename it to mem_cgroup_flush_stats_ratelimited(). 
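A hedged sketch of the reordering in workingset_refault() (mm/workingset.c is not among the hunks shown); the elided parts are placeholders.

void workingset_refault(struct folio *folio, void *shadow)
{
        /* ... unpack the shadow entry and find the eviction memcg (elided) ... */

        /* Flush while sleeping is still allowed, before taking the RCU read lock */
        mem_cgroup_flush_stats_ratelimited();

        rcu_read_lock();
        /* ... refault distance check and LRU/stat updates (elided) ... */
        rcu_read_unlock();
}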
Link: https://lkml.kernel.org/r/20230330191801.1967435-7-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Jens Axboe Cc: Josef Bacik Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Vasily Averin Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3db355e6677f..222d7370134c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1039,7 +1039,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, void mem_cgroup_flush_stats(void); void mem_cgroup_flush_stats_atomic(void); -void mem_cgroup_flush_stats_atomic_ratelimited(void); +void mem_cgroup_flush_stats_ratelimited(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1541,7 +1541,7 @@ static inline void mem_cgroup_flush_stats_atomic(void) { } -static inline void mem_cgroup_flush_stats_atomic_ratelimited(void) +static inline void mem_cgroup_flush_stats_ratelimited(void) { } -- cgit v1.2.3 From 6efc7afb5cc98488410d44695685d003d832534d Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Wed, 29 Mar 2023 08:11:20 -0700 Subject: mm/hwpoison: introduce copy_mc_highpage Similar to how copy_mc_user_highpage is implemented for copy_user_highpage on #MC supported architecture, introduce the #MC handled version of copy_highpage. This helper has immediate usage when khugepaged wants to copy file-backed memory pages and tolerate #MC. Link: https://lkml.kernel.org/r/20230329151121.949896-3-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Reviewed-by: Yang Shi Cc: David Stevens Cc: Hugh Dickins Cc: Kefeng Wang Cc: Kirill A. Shutemov Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tong Tiangen Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/highmem.h | 54 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 9c7cdaa3de8c..4de1dbcd3ef6 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -315,7 +315,29 @@ static inline void copy_user_highpage(struct page *to, struct page *from, #endif +#ifndef __HAVE_ARCH_COPY_HIGHPAGE + +static inline void copy_highpage(struct page *to, struct page *from) +{ + char *vfrom, *vto; + + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); + copy_page(vto, vfrom); + kmsan_copy_page_meta(to, from); + kunmap_local(vto); + kunmap_local(vfrom); +} + +#endif + #ifdef copy_mc_to_kernel +/* + * If architecture supports machine check exception handling, define the + * #MC versions of copy_user_highpage and copy_highpage. They copy a memory + * page with #MC in source page (@from) handled, and return the number + * of bytes not copied if there was a #MC, otherwise 0 for success. 
+ */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { @@ -332,29 +354,35 @@ static inline int copy_mc_user_highpage(struct page *to, struct page *from, return ret; } -#else -static inline int copy_mc_user_highpage(struct page *to, struct page *from, - unsigned long vaddr, struct vm_area_struct *vma) -{ - copy_user_highpage(to, from, vaddr, vma); - return 0; -} -#endif -#ifndef __HAVE_ARCH_COPY_HIGHPAGE - -static inline void copy_highpage(struct page *to, struct page *from) +static inline int copy_mc_highpage(struct page *to, struct page *from) { + unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); - copy_page(vto, vfrom); - kmsan_copy_page_meta(to, from); + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); + if (!ret) + kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); + + return ret; +} +#else +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_user_highpage(to, from, vaddr, vma); + return 0; } +static inline int copy_mc_highpage(struct page *to, struct page *from) +{ + copy_highpage(to, from); + return 0; +} #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, -- cgit v1.2.3 From b4aca54792e76556bb9f8661bab07b4bf2eb4e5f Mon Sep 17 00:00:00 2001 From: Steven Price Date: Wed, 5 Apr 2023 11:38:19 +0100 Subject: smaps: fix defined but not used smaps_shmem_walk_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When !CONFIG_SHMEM smaps_shmem_walk_ops is defined but not used, triggering a compiler warning. To avoid the warning remove the #ifdef around the usage. This has no effect because shmem_mapping() is a stub returning false when !CONFIG_SHMEM so the code will be compiled out, however we now need to also provide a stub for shmem_swap_usage(). Link: https://lkml.kernel.org/r/20230405103819.151246-1-steven.price@arm.com Fixes: 7b86ac3371b7 ("pagewalk: separate function pointers from iterator data") Signed-off-by: Steven Price Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202304031749.UiyJpxzF-lkp@intel.com/ Reviewed-by: Jason Gunthorpe Cc: Christoph Hellwig Cc: Alexey Dobriyan Cc: Thomas Hellström Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 3bb8d21edbb3..b9516a9802bf 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -96,7 +96,14 @@ int shmem_unuse(unsigned int type); extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +#ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); +#else +static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + return 0; +} +#endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); -- cgit v1.2.3 From e87340ca5c9cecc8a11daf1a2dcabf23f06a4e10 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Mon, 10 Apr 2023 21:39:29 +0800 Subject: userfaultfd: convert copy_huge_page_from_user() to copy_folio_from_user() Replace copy_huge_page_from_user() with copy_folio_from_user(). copy_folio_from_user() does the same as copy_huge_page_from_user(), but takes in a folio instead of a page. 
Convert page_kaddr to kaddr in copy_folio_from_user() to do indenting cleanup. Link: https://lkml.kernel.org/r/20230410133932.32288-4-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Muchun Song Cc: Nanyong Sun Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5e5ef6ee4003..01a009efc2cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3681,10 +3681,9 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr_hint, struct vm_area_struct *vma, unsigned int pages_per_huge_page); -extern long copy_huge_page_from_user(struct page *dst_page, - const void __user *usr_src, - unsigned int pages_per_huge_page, - bool allow_pagefault); +long copy_folio_from_user(struct folio *dst_folio, + const void __user *usr_src, + bool allow_pagefault); /** * vma_is_special_huge - Are transhuge page-table entries considered special? -- cgit v1.2.3 From 0169fd518a8934d8d723659752b07589ecc9f692 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Mon, 10 Apr 2023 21:39:30 +0800 Subject: userfaultfd: convert mfill_atomic_hugetlb() to use a folio Convert hugetlb_mfill_atomic_pte() to take in a folio pointer instead of a page pointer. Convert mfill_atomic_hugetlb() to use a folio. Link: https://lkml.kernel.org/r/20230410133932.32288-5-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Muchun Song Cc: Nanyong Sun Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2a758bcd6719..28703fe22386 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -163,7 +163,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, - struct page **pagep); + struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, @@ -397,7 +397,7 @@ static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, - struct page **pagep) + struct folio **foliop) { BUG(); return 0; -- cgit v1.2.3 From c0e8150e144b62ae467520d0b51c4707c09e897b Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Mon, 10 Apr 2023 21:39:31 +0800 Subject: mm: convert copy_user_huge_page() to copy_user_large_folio() Replace copy_user_huge_page() with copy_user_large_folio(). copy_user_large_folio() does the same as copy_user_huge_page(), but takes in folios instead of pages. Remove pages_per_huge_page from copy_user_large_folio(), because we can get that from folio_nr_pages(dst). Convert copy_user_gigantic_page() to take in folios. 
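A hedged sketch of the calling pattern these conversions serve in the UFFDIO_COPY paths; the helper name is invented, while copy_folio_from_user() and its allow_pagefault argument are from this series.

static int mfill_copy_from_user(struct folio *folio, unsigned long src_addr,
                                struct folio **foliop)
{
        long err;

        /* First attempt runs with the caller's locks held: no faults allowed */
        err = copy_folio_from_user(folio, (const void __user *)src_addr, false);
        if (likely(!err))
                return 0;

        /*
         * Hand the folio back; the caller drops its locks, retries the copy
         * with allow_pagefault=true, and then restarts the operation.
         */
        *foliop = folio;
        return -ENOENT;
}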
Link: https://lkml.kernel.org/r/20230410133932.32288-6-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Cc: Nanyong Sun Cc: Sidhartha Kumar Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 01a009efc2cb..5a3eaa9a1f8c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3677,10 +3677,9 @@ extern const struct attribute_group memory_failure_attr_group; extern void clear_huge_page(struct page *page, unsigned long addr_hint, unsigned int pages_per_huge_page); -extern void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr_hint, - struct vm_area_struct *vma, - unsigned int pages_per_huge_page); +void copy_user_large_folio(struct folio *dst, struct folio *src, + unsigned long addr_hint, + struct vm_area_struct *vma); long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); -- cgit v1.2.3 From d7be6d7eee1bbf98671d7a2c95654322241e2ae4 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Mon, 10 Apr 2023 21:39:32 +0800 Subject: userfaultfd: convert mfill_atomic() to use a folio Convert mfill_atomic_pte_copy(), shmem_mfill_atomic_pte() and mfill_atomic_pte() to take in a folio pointer. Convert mfill_atomic() to use a folio. Convert page_kaddr to kaddr in mfill_atomic(). Link: https://lkml.kernel.org/r/20230410133932.32288-7-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Mike Kravetz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Muchun Song Cc: Nanyong Sun Cc: Sidhartha Kumar Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index b9516a9802bf..9029abd29b1c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -165,10 +165,10 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, - struct page **pagep); + struct folio **foliop); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ - src_addr, flags, pagep) ({ BUG(); 0; }) + src_addr, flags, foliop) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ -- cgit v1.2.3 From 87a7ae75d7383afa998f57656d1d14e2a730cc47 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 11 Apr 2023 19:52:13 +0530 Subject: mm/vmemmap/devdax: fix kernel crash when probing devdax devices commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps") added support for using optimized vmmemap for devdax devices. But how vmemmap mappings are created are architecture specific. For example, powerpc with hash translation doesn't have vmemmap mappings in init_mm page table instead they are bolted table entries in the hardware page table vmemmap_populate_compound_pages() used by vmemmap optimization code is not aware of these architecture-specific mapping. Hence allow architecture to opt for this feature. I selected architectures supporting HUGETLB_PAGE_OPTIMIZE_VMEMMAP option as also supporting this feature. This patch fixes the below crash on ppc64. 
BUG: Unable to handle kernel data access on write at 0xc00c000100400038 Faulting instruction address: 0xc000000001269d90 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 7 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc5-150500.34-default+ #2 5c90a668b6bbd142599890245c2fb5de19d7d28a Hardware name: IBM,9009-42G POWER9 (raw) 0x4e0202 0xf000005 of:IBM,FW950.40 (VL950_099) hv:phyp pSeries NIP: c000000001269d90 LR: c0000000004c57d4 CTR: 0000000000000000 REGS: c000000003632c30 TRAP: 0300 Not tainted (6.3.0-rc5-150500.34-default+) MSR: 8000000000009033 CR: 24842228 XER: 00000000 CFAR: c0000000004c57d0 DAR: c00c000100400038 DSISR: 42000000 IRQMASK: 0 .... NIP [c000000001269d90] __init_single_page.isra.74+0x14/0x4c LR [c0000000004c57d4] __init_zone_device_page+0x44/0xd0 Call Trace: [c000000003632ed0] [c000000003632f60] 0xc000000003632f60 (unreliable) [c000000003632f10] [c0000000004c5ca0] memmap_init_zone_device+0x170/0x250 [c000000003632fe0] [c0000000005575f8] memremap_pages+0x2c8/0x7f0 [c0000000036330c0] [c000000000557b5c] devm_memremap_pages+0x3c/0xa0 [c000000003633100] [c000000000d458a8] dev_dax_probe+0x108/0x3e0 [c0000000036331a0] [c000000000d41430] dax_bus_probe+0xb0/0x140 [c0000000036331d0] [c000000000cef27c] really_probe+0x19c/0x520 [c000000003633260] [c000000000cef6b4] __driver_probe_device+0xb4/0x230 [c0000000036332e0] [c000000000cef888] driver_probe_device+0x58/0x120 [c000000003633320] [c000000000cefa6c] __device_attach_driver+0x11c/0x1e0 [c0000000036333a0] [c000000000cebc58] bus_for_each_drv+0xa8/0x130 [c000000003633400] [c000000000ceefcc] __device_attach+0x15c/0x250 [c0000000036334a0] [c000000000ced458] bus_probe_device+0x108/0x110 [c0000000036334f0] [c000000000ce92dc] device_add+0x7fc/0xa10 [c0000000036335b0] [c000000000d447c8] devm_create_dev_dax+0x1d8/0x530 [c000000003633640] [c000000000d46b60] __dax_pmem_probe+0x200/0x270 [c0000000036337b0] [c000000000d46bf0] dax_pmem_probe+0x20/0x70 [c0000000036337d0] [c000000000d2279c] nvdimm_bus_probe+0xac/0x2b0 [c000000003633860] [c000000000cef27c] really_probe+0x19c/0x520 [c0000000036338f0] [c000000000cef6b4] __driver_probe_device+0xb4/0x230 [c000000003633970] [c000000000cef888] driver_probe_device+0x58/0x120 [c0000000036339b0] [c000000000cefd08] __driver_attach+0x1d8/0x240 [c000000003633a30] [c000000000cebb04] bus_for_each_dev+0xb4/0x130 [c000000003633a90] [c000000000cee564] driver_attach+0x34/0x50 [c000000003633ab0] [c000000000ced878] bus_add_driver+0x218/0x300 [c000000003633b40] [c000000000cf1144] driver_register+0xa4/0x1b0 [c000000003633bb0] [c000000000d21a0c] __nd_driver_register+0x5c/0x100 [c000000003633c10] [c00000000206a2e8] dax_pmem_init+0x34/0x48 [c000000003633c30] [c0000000000132d0] do_one_initcall+0x60/0x320 [c000000003633d00] [c0000000020051b0] kernel_init_freeable+0x360/0x400 [c000000003633de0] [c000000000013764] kernel_init+0x34/0x1d0 [c000000003633e50] [c00000000000de14] ret_from_kernel_thread+0x5c/0x64 Link: https://lkml.kernel.org/r/20230411142214.64464-1-aneesh.kumar@linux.ibm.com Fixes: 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps") Signed-off-by: Aneesh Kumar K.V Reported-by: Tarun Sahu Reviewed-by: Joao Martins Cc: Muchun Song Cc: Dan Williams Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a3eaa9a1f8c..21a7e2460084 100644 --- a/include/linux/mm.h +++ 
b/include/linux/mm.h @@ -3560,6 +3560,22 @@ void vmemmap_populate_print_last(void); void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif + +#ifdef CONFIG_ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return is_power_of_2(sizeof(struct page)) && + pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap; +} +#else +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return false; +} +#endif + void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long nr_pages); -- cgit v1.2.3 From 0b376f1e0ff555435597fa16823ae0f30b2883e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 12 Apr 2023 10:30:25 +0530 Subject: mm/hugetlb_vmemmap: rename ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP Now we use ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP config option to indicate devdax and hugetlb vmemmap optimization support. Hence rename that to a generic ARCH_WANT_OPTIMIZE_VMEMMAP Link: https://lkml.kernel.org/r/20230412050025.84346-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Muchun Song Cc: Joao Martins Cc: Dan Williams Cc: Mike Kravetz Cc: Tarun Sahu Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 21a7e2460084..0b514de43143 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3561,7 +3561,7 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif -#ifdef CONFIG_ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { -- cgit v1.2.3 From 1cb9dc4b475c7418f925ab0c97b6750007d9f52e Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 13 Apr 2023 21:13:49 +0800 Subject: mm: hwpoison: support recovery from HugePage copy-on-write faults copy-on-write of hugetlb user pages with uncorrectable errors will result in a kernel crash. This is because the copy is performed in kernel mode and in general we can not handle accessing memory with such errors while in kernel mode. Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write faults") introduced the routine copy_user_highpage_mc() to gracefully handle copying of user pages with uncorrectable errors. However, the separate hugetlb copy-on-write code paths were not modified as part of commit a873dfe1032a. Modify hugetlb copy-on-write code paths to use copy_mc_user_highpage() so that they can also gracefully handle uncorrectable errors in user pages. This involves changing the hugetlb specific routine copy_user_large_folio() from type void to int so that it can return an error. Modify the hugetlb userfaultfd code in the same way so that it can return -EHWPOISON if it encounters an uncorrectable error. 
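As a hedged caller-side sketch of what the int return buys (this is not the hugetlb.c hunk from this patch; the demo_* name and the exact fault code chosen below are assumptions for illustration), a hugetlb copy-on-write path can now fail the fault when the source page is poisoned instead of consuming the error in kernel mode:

#include <linux/mm.h>
#include <linux/hugetlb.h>

/*
 * Illustrative only: a non-zero return from copy_user_large_folio() means
 * the copy hit an uncorrectable memory error, so the fault is failed
 * gracefully rather than crashing the kernel.
 */
static vm_fault_t demo_hugetlb_wp_copy(struct folio *new_folio,
                                       struct folio *old_folio,
                                       unsigned long addr,
                                       struct vm_area_struct *vma)
{
        if (copy_user_large_folio(new_folio, old_folio, addr, vma))
                return VM_FAULT_HWPOISON_LARGE; /* assumed fault code for this sketch */

        return 0;
}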
Link: https://lkml.kernel.org/r/20230413131349.2524210-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Miaohe Lin Cc: Muchun Song Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b514de43143..2772a1800cf3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3693,9 +3693,9 @@ extern const struct attribute_group memory_failure_attr_group; extern void clear_huge_page(struct page *page, unsigned long addr_hint, unsigned int pages_per_huge_page); -void copy_user_large_folio(struct folio *dst, struct folio *src, - unsigned long addr_hint, - struct vm_area_struct *vma); +int copy_user_large_folio(struct folio *dst, struct folio *src, + unsigned long addr_hint, + struct vm_area_struct *vma); long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); -- cgit v1.2.3 From bb1508c24c9c361e6344308c8de2cb81d7f228ba Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 13 Apr 2023 15:12:22 +0200 Subject: mm: kmsan: apply __must_check to non-void functions Non-void KMSAN hooks may return error codes that indicate that KMSAN failed to reflect the changed memory state in the metadata (e.g. it could not create the necessary memory mappings). In such cases the callers should handle the errors to prevent the tool from using the inconsistent metadata in the future. We mark non-void hooks with __must_check so that error handling is not skipped. Link: https://lkml.kernel.org/r/20230413131223.4135168-3-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Christoph Hellwig Cc: Dipanjan Das Cc: Dmitry Vyukov Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 30b17647ce3c..e0c23a32cdf0 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -54,7 +54,8 @@ void __init kmsan_init_runtime(void); * Freed pages are either returned to buddy allocator or held back to be used * as metadata pages. */ -bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order); +bool __init __must_check kmsan_memblock_free_pages(struct page *page, + unsigned int order); /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. @@ -137,9 +138,11 @@ void kmsan_kfree_large(const void *ptr); * vmalloc metadata address range. Returns 0 on success, callers must check * for non-zero return value. */ -int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages, - unsigned int page_shift); +int __must_check kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -163,9 +166,9 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); * virtual memory. Returns 0 on success, callers must check for non-zero return * value. 
*/ -int kmsan_ioremap_page_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift); +int __must_check kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); /** * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. @@ -237,8 +240,8 @@ static inline void kmsan_init_runtime(void) { } -static inline bool kmsan_memblock_free_pages(struct page *page, - unsigned int order) +static inline bool __must_check kmsan_memblock_free_pages(struct page *page, + unsigned int order) { return true; } @@ -251,10 +254,9 @@ static inline void kmsan_task_exit(struct task_struct *task) { } -static inline int kmsan_alloc_page(struct page *page, unsigned int order, - gfp_t flags) +static inline void kmsan_alloc_page(struct page *page, unsigned int order, + gfp_t flags) { - return 0; } static inline void kmsan_free_page(struct page *page, unsigned int order) @@ -283,11 +285,9 @@ static inline void kmsan_kfree_large(const void *ptr) { } -static inline int kmsan_vmap_pages_range_noflush(unsigned long start, - unsigned long end, - pgprot_t prot, - struct page **pages, - unsigned int page_shift) +static inline int __must_check kmsan_vmap_pages_range_noflush( + unsigned long start, unsigned long end, pgprot_t prot, + struct page **pages, unsigned int page_shift) { return 0; } @@ -297,10 +297,11 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start, { } -static inline int kmsan_ioremap_page_range(unsigned long start, - unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift) +static inline int __must_check kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, + pgprot_t prot, + unsigned int page_shift) { return 0; } -- cgit v1.2.3 From c7b23b68e2aa93f86a206222d23ccd9a21f5982a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 13 Apr 2023 10:40:34 +0000 Subject: mm: vmscan: refactor updating current->reclaim_state During reclaim, we keep track of pages reclaimed from other means than LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab, which we stash a pointer to in current task_struct. However, we keep track of more than just reclaimed slab pages through this. We also use it for clean file pages dropped through pruned inodes, and xfs buffer pages freed. Rename reclaimed_slab to reclaimed, and add a helper function that wraps updating it through current, so that future changes to this logic are contained within include/linux/swap.h. Link: https://lkml.kernel.org/r/20230413104034.1086717-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Michal Hocko Cc: Alexander Viro Cc: Christoph Lameter Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: David Hildenbrand Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Miaohe Lin Cc: NeilBrown Cc: Peter Xu Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Chen Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/swap.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index bfc3b06b5f8f..7f7d5b9ddf7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -153,13 +153,28 @@ union swap_header { * memory reclaim */ struct reclaim_state { - unsigned long reclaimed_slab; + /* pages reclaimed outside of LRU-based reclaim */ + unsigned long reclaimed; #ifdef CONFIG_LRU_GEN /* per-thread mm walk data */ struct lru_gen_mm_walk *mm_walk; #endif }; +/* + * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based + * reclaim + * @pages: number of pages reclaimed + * + * If the current process is undergoing a reclaim operation, increment the + * number of reclaimed pages by @pages. + */ +static inline void mm_account_reclaimed_pages(unsigned long pages) +{ + if (current->reclaim_state) + current->reclaim_state->reclaimed += pages; +} + #ifdef __KERNEL__ struct address_space; -- cgit v1.2.3 From 7f63cf2d9b9bbe7b90f808927558a66ff737d399 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 13 Apr 2023 14:43:26 -0700 Subject: mm: Multi-gen LRU: remove wait_event_killable() Android 14 and later default to MGLRU [1] and field telemetry showed occasional long tail latency (>100ms) in the reclaim path. Tracing revealed priority inversion in the reclaim path. In try_to_inc_max_seq(), when high priority tasks were blocked on wait_event_killable(), the preemption of the low priority task to call wake_up_all() caused those high priority tasks to wait longer than necessary. In general, this problem is not different from others of its kind, e.g., one caused by mutex_lock(). However, it is specific to MGLRU because it introduced the new wait queue lruvec->mm_state.wait. The purpose of this new wait queue is to avoid the thundering herd problem. If many direct reclaimers rush into try_to_inc_max_seq(), only one can succeed, i.e., the one to wake up the rest, and the rest who failed might cause premature OOM kills if they do not wait. So far there is no evidence supporting this scenario, based on how often the wait has been hit. And this begs the question how useful the wait queue is in practice. Based on Minchan's recommendation, which is in line with his commit 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the rest of the MGLRU code which also uses trylock when possible, remove the wait queue. 
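A generic sketch of the trylock-and-bail pattern this change moves toward (names are illustrative; this is not the actual try_to_inc_max_seq() code): a direct reclaimer that loses the race simply skips the walk instead of sleeping behind a possibly preempted waker.

#include <linux/spinlock.h>

/*
 * Illustrative pattern only: callers that cannot take the lock do not
 * sleep on a wait queue; they return and let the winner finish the walk.
 */
static bool demo_try_do_walk(spinlock_t *walk_lock, void (*do_walk)(void))
{
        if (!spin_trylock(walk_lock))
                return false;           /* another walker owns it; don't block */

        do_walk();                      /* e.g. the page table walk */
        spin_unlock(walk_lock);
        return true;
}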
[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Kalesh Singh Suggested-by: Minchan Kim Reported-by: Wei Wang Acked-by: Yu Zhao Cc: Minchan Kim Cc: Jan Alexander Steffens (heftig) Cc: Oleksandr Natalenko Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 337956748976..a4889c9d4055 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -443,18 +443,14 @@ enum { struct lru_gen_mm_state { /* set to max_seq after each iteration */ unsigned long seq; - /* where the current iteration continues (inclusive) */ + /* where the current iteration continues after */ struct list_head *head; - /* where the last iteration ended (exclusive) */ + /* where the last iteration ended before */ struct list_head *tail; - /* to wait for the last page table walker to finish */ - struct wait_queue_head wait; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; - /* the number of concurrent page table walkers */ - int nr_walkers; }; struct lru_gen_mm_walk { -- cgit v1.2.3 From f7b8f70ba44fb63df47b6fc3204d49ac2885de64 Mon Sep 17 00:00:00 2001 From: Luca Vizzarro Date: Fri, 14 Apr 2023 16:24:58 +0100 Subject: memfd: pass argument of memfd_fcntl as int The interface for fcntl expects the argument passed for the command F_ADD_SEALS to be of type int. The current code wrongly treats it as a long. In order to avoid access to undefined bits, we should explicitly cast the argument to int. This commit changes the signature of all the related and helper functions so that they treat the argument as int instead of long. Link: https://lkml.kernel.org/r/20230414152459.816046-5-Luca.Vizzarro@arm.com Signed-off-by: Luca Vizzarro Cc: Alexander Viro Cc: Christian Brauner Cc: Jeff Layton Cc: Chuck Lever Cc: Kevin Brodsky Cc: Vincenzo Frascino Cc: Szabolcs Nagy Cc: "Theodore Ts'o" Cc: David Laight Cc: Mark Rutland Signed-off-by: Andrew Morton --- include/linux/memfd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 4f1600413f91..e7abf6fa4c52 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -5,9 +5,9 @@ #include #ifdef CONFIG_MEMFD_CREATE -extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); #else -static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { return -EINVAL; } -- cgit v1.2.3 From 4248d0083ec5817eebfb916c54950d100b3468ee Mon Sep 17 00:00:00 2001 From: Longlong Xia Date: Fri, 14 Apr 2023 10:17:41 +0800 Subject: mm: ksm: support hwpoison for ksm page hwpoison_user_mappings() is updated to support ksm pages, and add collect_procs_ksm() to collect processes when the error hit an ksm page. The difference from collect_procs_anon() is that it also needs to traverse the rmap-item list on the stable node of the ksm page. 
At the same time, add_to_kill_ksm() is added to handle ksm pages. And task_in_to_kill_list() is added to avoid duplicate addition of tsk to the to_kill list. This is because when scanning the list, if the pages that make up the ksm page all come from the same process, they may be added repeatedly. Link: https://lkml.kernel.org/r/20230414021741.2597273-3-xialonglong1@huawei.com Signed-off-by: Longlong Xia Tested-by: Naoya Horiguchi Reviewed-by: Kefeng Wang Cc: Miaohe Lin Cc: Nanyong Sun Signed-off-by: Andrew Morton --- include/linux/ksm.h | 11 +++++++++++ include/linux/mm.h | 7 +++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 7e232ba59b86..d9e701326a88 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -51,6 +51,10 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); +#ifdef CONFIG_MEMORY_FAILURE +void collect_procs_ksm(struct page *page, struct list_head *to_kill, + int force_early); +#endif #else /* !CONFIG_KSM */ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) @@ -62,6 +66,13 @@ static inline void ksm_exit(struct mm_struct *mm) { } +#ifdef CONFIG_MEMORY_FAILURE +static inline void collect_procs_ksm(struct page *page, + struct list_head *to_kill, int force_early) +{ +} +#endif + #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2772a1800cf3..37554b08bb28 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3604,6 +3604,7 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); +struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { @@ -3624,6 +3625,12 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i) } #endif +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM) +void add_to_kill_ksm(struct task_struct *tsk, struct page *p, + struct vm_area_struct *vma, struct list_head *to_kill, + unsigned long ksm_addr); +#endif + #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); -- cgit v1.2.3 From 465e5e6a1698f3325f5d2262d2875779b06b50c7 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 17 Apr 2023 14:36:15 +0200 Subject: fs/buffer: add folio_set_bh helper Patch series "convert create_page_buffers to folio_create_buffers". One of the first kernel panic we hit when we try to increase the block size > 4k is inside create_page_buffers()[1]. Even though buffer.c function do not support large folios (folios > PAGE_SIZE) at the moment, these changes are required when we want to remove that constraint. This patch (of 4): The folio version of set_bh_page(). This is required to convert create_page_buffers() to folio_create_buffers() later in the series. 
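A small usage sketch of the new helper (the demo_* name is made up; this is not the buffer.c conversion itself): a circular chain of buffer heads being pointed at successive block-sized offsets within a folio, where page-based code would have called set_bh_page().

#include <linux/buffer_head.h>

/*
 * Illustrative only: walk the circular buffer_head list and attach each
 * buffer to its block-sized slot in the folio via folio_set_bh().
 */
static void demo_map_buffers_to_folio(struct buffer_head *head,
                                      struct folio *folio,
                                      unsigned long blocksize)
{
        struct buffer_head *bh = head;
        unsigned long offset = 0;

        do {
                folio_set_bh(bh, folio, offset);
                offset += blocksize;
                bh = bh->b_this_page;
        } while (bh != head);
}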
Link: https://lkml.kernel.org/r/20230417123618.22094-1-p.raghav@samsung.com Link: https://lkml.kernel.org/r/20230417123618.22094-2-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Hannes Reinecke Cc: Alexander Viro Cc: Christian Brauner Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 8f14dca5fed7..7e92d23f4782 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -196,6 +196,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); +void folio_set_bh(struct buffer_head *bh, struct folio *folio, + unsigned long offset); bool try_to_free_buffers(struct folio *); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); -- cgit v1.2.3 From c71124a8afa441af203d2001a92c91f114446687 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 17 Apr 2023 14:36:16 +0200 Subject: buffer: add folio_alloc_buffers() helper Folio version of alloc_page_buffers() helper. This is required to convert create_page_buffers() to folio_create_buffers() later in the series. alloc_page_buffers() has been modified to call folio_alloc_buffers() which adds one call to compound_head() but folio_alloc_buffers() removes one call to compound_head() compared to the existing alloc_page_buffers() implementation. Link: https://lkml.kernel.org/r/20230417123618.22094-3-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Hannes Reinecke Cc: Alexander Viro Cc: Christian Brauner Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7e92d23f4782..0b14eab41bd1 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -199,6 +199,8 @@ void set_bh_page(struct buffer_head *bh, void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); bool try_to_free_buffers(struct folio *); +struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, + bool retry); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, -- cgit v1.2.3 From 8e2e17560bed73c2a73c94cbe378719f6cf850ee Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 17 Apr 2023 14:36:17 +0200 Subject: fs/buffer: add folio_create_empty_buffers helper Folio version of create_empty_buffers(). This is required to convert create_page_buffers() to folio_create_buffers() later in the series. It removes several calls to compound_head() as it works directly on folio compared to create_empty_buffers(). Hence, create_empty_buffers() has been modified to call folio_create_empty_buffers(). 
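A hedged filesystem-side sketch of the intended call pattern (the demo_* name is hypothetical and a locked folio is assumed, as existing create_empty_buffers() callers already require): ensure a folio has buffers using the folio-based helper rather than the page-based one.

#include <linux/buffer_head.h>

/*
 * Sketch only: give a locked folio a set of empty buffers if it has none
 * yet, then hand back the head of the list via folio_buffers().
 */
static struct buffer_head *demo_folio_get_buffers(struct folio *folio,
                                                  unsigned long blocksize)
{
        struct buffer_head *head = folio_buffers(folio);

        if (!head) {
                folio_create_empty_buffers(folio, blocksize, 0);
                head = folio_buffers(folio);
        }
        return head;
}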
Link: https://lkml.kernel.org/r/20230417123618.22094-4-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Hannes Reinecke Cc: Alexander Viro Cc: Christian Brauner Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 0b14eab41bd1..1520793c72da 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -205,6 +205,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); +void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, + unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); void end_buffer_async_write(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 4bf4f155bfbc77a12ad2c9e12d9f57718adb9a5c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 17 Apr 2023 19:48:07 +0800 Subject: mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list() Both of them change the arg from page_list to folio_list when convert them to use a folio, but not the declaration, let's correct it, also move the reclaim_pages() from swap.h to internal.h as it only used in mm. Link: https://lkml.kernel.org/r/20230417114807.186786-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviwed-by: Matthew Wilcox (Oracle) Reviewed-by: SeongJae Park Reviewed-by: David Hildenbrand Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7f7d5b9ddf7e..3c69cb653cb9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -442,7 +442,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); -extern unsigned long reclaim_pages(struct list_head *page_list); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; -- cgit v1.2.3 From d7597f59d1d33e9efbffa7060deb9ee5bd119e62 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:40 -0700 Subject: mm: add new api to enable ksm per process Patch series "mm: process/cgroup ksm support", v9. So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. Use case 1: The madvise call is not available in the programming language. An example for this are programs with forked workloads using a garbage collected language without pointers. In such a language madvise cannot be made available. In addition the addresses of objects get moved around as they are garbage collected. KSM sharing needs to be enabled "from the outside" for these type of workloads. Use case 2: The same interpreter can also be used for workloads where KSM brings no benefit or even has overhead. We'd like to be able to enable KSM on a workload by workload basis. Use case 3: With the madvise call sharing opportunities are only enabled for the current process: it is a workload-local decision. 
A considerable number of sharing opportunities may exist across multiple workloads or jobs (if they are part of the same security domain). Only a higler level entity like a job scheduler or container can know for certain if its running one or more instances of a job. That job scheduler however doesn't have the necessary internal workload knowledge to make targeted madvise calls. Security concerns: In previous discussions security concerns have been brought up. The problem is that an individual workload does not have the knowledge about what else is running on a machine. Therefore it has to be very conservative in what memory areas can be shared or not. However, if the system is dedicated to running multiple jobs within the same security domain, its the job scheduler that has the knowledge that sharing can be safely enabled and is even desirable. Performance: Experiments with using UKSM have shown a capacity increase of around 20%. Here are the metrics from an instagram workload (taken from a machine with 64GB main memory): full_scans: 445 general_profit: 20158298048 max_page_sharing: 256 merge_across_nodes: 1 pages_shared: 129547 pages_sharing: 5119146 pages_to_scan: 4000 pages_unshared: 1760924 pages_volatile: 10761341 run: 1 sleep_millisecs: 20 stable_node_chains: 167 stable_node_chains_prune_millisecs: 2000 stable_node_dups: 2751 use_zero_pages: 0 zero_pages_sharing: 0 After the service is running for 30 minutes to an hour, 4 to 5 million shared pages are common for this workload when using KSM. Detailed changes: 1. New options for prctl system command This patch series adds two new options to the prctl system call. The first one allows to enable KSM at the process level and the second one to query the setting. The setting will be inherited by child processes. With the above setting, KSM can be enabled for the seed process of a cgroup and all processes in the cgroup will inherit the setting. 2. Changes to KSM processing When KSM is enabled at the process level, the KSM code will iterate over all the VMA's and enable KSM for the eligible VMA's. When forking a process that has KSM enabled, the setting will be inherited by the new child process. 3. Add general_profit metric The general_profit metric of KSM is specified in the documentation, but not calculated. This adds the general profit metric to /sys/kernel/debug/mm/ksm. 4. Add more metrics to ksm_stat This adds the process profit metric to /proc//ksm_stat. 5. Add more tests to ksm_tests and ksm_functional_tests This adds an option to specify the merge type to the ksm_tests. This allows to test madvise and prctl KSM. It also adds a two new tests to ksm_functional_tests: one to test the new prctl options and the other one is a fork test to verify that the KSM process setting is inherited by client processes. This patch (of 3): So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. 1. New options for prctl system command This patch series adds two new options to the prctl system call. The first one allows to enable KSM at the process level and the second one to query the setting. The setting will be inherited by child processes. With the above setting, KSM can be enabled for the seed process of a cgroup and all processes in the cgroup will inherit the setting. 2. 
Changes to KSM processing When KSM is enabled at the process level, the KSM code will iterate over all the VMA's and enable KSM for the eligible VMA's. When forking a process that has KSM enabled, the setting will be inherited by the new child process. 1) Introduce new MMF_VM_MERGE_ANY flag This introduces the new flag MMF_VM_MERGE_ANY flag. When this flag is set, kernel samepage merging (ksm) gets enabled for all vma's of a process. 2) Setting VM_MERGEABLE on VMA creation When a VMA is created, if the MMF_VM_MERGE_ANY flag is set, the VM_MERGEABLE flag will be set for this VMA. 3) support disabling of ksm for a process This adds the ability to disable ksm for a process if ksm has been enabled for the process with prctl. 4) add new prctl option to get and set ksm for a process This adds two new options to the prctl system call - enable ksm for all vmas of a process (if the vmas support it). - query if ksm has been enabled for a process. 3. Disabling MMF_VM_MERGE_ANY for storage keys in s390 In the s390 architecture when storage keys are used, the MMF_VM_MERGE_ANY will be disabled. Link: https://lkml.kernel.org/r/20230418051342.1919757-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20230418051342.1919757-2-shr@devkernel.io Signed-off-by: Stefan Roesch Acked-by: David Hildenbrand Cc: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Bagas Sanjaya Signed-off-by: Andrew Morton --- include/linux/ksm.h | 21 +++++++++++++++++++-- include/linux/sched/coredump.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index d9e701326a88..4647b0c70c12 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -18,13 +18,26 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); + +void ksm_add_vma(struct vm_area_struct *vma); +int ksm_enable_merge_any(struct mm_struct *mm); + int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); + int ret; + + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) { + ret = __ksm_enter(mm); + if (ret) + return ret; + } + + if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags)) + set_bit(MMF_VM_MERGE_ANY, &mm->flags); + return 0; } @@ -57,6 +70,10 @@ void collect_procs_ksm(struct page *page, struct list_head *to_kill, #endif #else /* !CONFIG_KSM */ +static inline void ksm_add_vma(struct vm_area_struct *vma) +{ +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { return 0; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 0e17ae7fbfd3..0ee96ea7a0e9 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -90,4 +90,5 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) +#define MMF_VM_MERGE_ANY 29 #endif /* _LINUX_SCHED_COREDUMP_H */ -- cgit v1.2.3 From d21077fbc2fc987c2e593c34dc3b4d84e546dc9f Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:41 -0700 Subject: mm: add new KSM process and sysfs knobs This adds the general_profit KSM sysfs knob and the process profit metric knobs to ksm_stat. 
1) expose general_profit metric The documentation mentions a general profit metric, however this metric is not calculated. In addition the formula depends on the size of internal structures, which makes it more difficult for an administrator to make the calculation. Adding the metric for a better user experience. 2) document general_profit sysfs knob 3) calculate ksm process profit metric The ksm documentation mentions the process profit metric and how to calculate it. This adds the calculation of the metric. 4) mm: expose ksm process profit metric in ksm_stat This exposes the ksm process profit metric in /proc//ksm_stat. The documentation mentions the formula for the ksm process profit metric, however it does not calculate it. In addition the formula depends on the size of internal structures. So it makes sense to expose it. 5) document new procfs ksm knobs Link: https://lkml.kernel.org/r/20230418051342.1919757-3-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: Bagas Sanjaya Acked-by: David Hildenbrand Cc: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/ksm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 4647b0c70c12..7a9b76fb6c3f 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -68,6 +68,11 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early); #endif + +#ifdef CONFIG_PROC_FS +long ksm_process_profit(struct mm_struct *); +#endif /* CONFIG_PROC_FS */ + #else /* !CONFIG_KSM */ static inline void ksm_add_vma(struct vm_area_struct *vma) -- cgit v1.2.3 From f724392415b3e1ae844d2766669c0d955fe9a17b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 18 Apr 2023 22:22:02 -0700 Subject: hugetlb: pte_alloc_huge() to replace huge pte_alloc_map() Some architectures can have their hugetlb pages down at the lowest PTE level: their huge_pte_alloc() using pte_alloc_map(), but without any following pte_unmap(). Since none of these arches uses CONFIG_HIGHPTE, this is not seen as a problem at present; but would become a problem if forthcoming changes were to add an rcu_read_lock() into pte_offset_map(), with the rcu_read_unlock() expected in pte_unmap(). Similarly in their huge_pte_offset(): pte_offset_kernel() is good enough for that, but it's probably less confusing if we define pte_offset_huge() along with pte_alloc_huge(). Only define them without CONFIG_HIGHPTE: so there would be a build error to signal if ever more work is needed. For ease of development, define these now for 6.4-rc1, ahead of any use: then architectures can integrate patches using them, independent from mm. 
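An arch-side sketch of the intended use (illustrative only, not any particular architecture's huge_pte_alloc(); the demo_* name is an assumption): descend to the PMD and use pte_alloc_huge() for the lowest level, with no pte_unmap() required afterwards.

#include <linux/mm.h>
#include <linux/hugetlb.h>

/*
 * Illustrative huge_pte_alloc() for an architecture whose smallest huge
 * page lives at the PTE level; pte_alloc_huge() replaces the previous
 * pte_alloc_map()-without-pte_unmap() pattern.
 */
static pte_t *demo_huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
                                  unsigned long sz)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d = p4d_alloc(mm, pgd, addr);
        pud_t *pud = p4d ? pud_alloc(mm, p4d, addr) : NULL;
        pmd_t *pmd = pud ? pmd_alloc(mm, pud, addr) : NULL;

        return pmd ? pte_alloc_huge(mm, pmd, addr) : NULL;
}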
Link: https://lkml.kernel.org/r/ae9e7d98-8a3a-cfd9-4762-bcddffdf96cf@google.com Signed-off-by: Hugh Dickins Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 28703fe22386..cbe1e97a15a1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -191,6 +191,23 @@ extern struct list_head huge_boot_pages; /* arch callbacks */ +#ifndef CONFIG_HIGHPTE +/* + * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures + * which may go down to the lowest PTE level in their huge_pte_offset() and + * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap(). + */ +static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address) +{ + return pte_offset_kernel(pmd, address); +} +static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address); +} +#endif + pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); /* -- cgit v1.2.3
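And the matching lookup side, again as an assumed illustration rather than real arch code: a huge_pte_offset() that reaches the PTE level can return pte_offset_huge() instead of open-coding pte_offset_kernel().

#include <linux/mm.h>
#include <linux/hugetlb.h>

/*
 * Illustrative huge_pte_offset() counterpart to the sketch above: walk
 * down to the PMD and use pte_offset_huge() for PTE-level huge pages.
 */
static pte_t *demo_huge_pte_offset(struct mm_struct *mm, unsigned long addr,
                                   unsigned long sz)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        if (pgd_none(*pgd))
                return NULL;
        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d))
                return NULL;
        pud = pud_offset(p4d, addr);
        if (pud_none(*pud))
                return NULL;
        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return NULL;

        return pte_offset_huge(pmd, addr);
}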