From d92725256b4f22d084b813b37ddc394da79aacab Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 30 May 2022 14:34:50 -0400 Subject: mm: avoid unnecessary page fault retires on shared memory types I observed that for each of the shared file-backed page faults, we're very likely to retry one more time for the 1st write fault upon no page. It's because we'll need to release the mmap lock for dirty rate limit purpose with balance_dirty_pages_ratelimited() (in fault_dirty_shared_page()). Then after that throttling we return VM_FAULT_RETRY. We did that probably because VM_FAULT_RETRY is the only way we can return to the fault handler at that time telling it we've released the mmap lock. However that's not ideal because it's very likely the fault does not need to be retried at all since the pgtable was well installed before the throttling, so the next continuous fault (including taking mmap read lock, walk the pgtable, etc.) could be in most cases unnecessary. It's not only slowing down page faults for shared file-backed, but also add more mmap lock contention which is in most cases not needed at all. To observe this, one could try to write to some shmem page and look at "pgfault" value in /proc/vmstat, then we should expect 2 counts for each shmem write simply because we retried, and vm event "pgfault" will capture that. To make it more efficient, add a new VM_FAULT_COMPLETED return code just to show that we've completed the whole fault and released the lock. It's also a hint that we should very possibly not need another fault immediately on this page because we've just completed it. This patch provides a ~12% perf boost on my aarch64 test VM with a simple program sequentially dirtying 400MB shmem file being mmap()ed and these are the time it needs: Before: 650.980 ms (+-1.94%) After: 569.396 ms (+-1.38%) I believe it could help more than that. We need some special care on GUP and the s390 pgfault handler (for gmap code before returning from pgfault), the rest changes in the page fault handlers should be relatively straightforward. Another thing to mention is that mm_account_fault() does take this new fault as a generic fault to be accounted, unlike VM_FAULT_RETRY. I explicitly didn't touch hmm_vma_fault() and break_ksm() because they do not handle VM_FAULT_RETRY even with existing code, so I'm literally keeping them as-is. Link: https://lkml.kernel.org/r/20220530183450.42886-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Geert Uytterhoeven Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Acked-by: Vineet Gupta Acked-by: Guo Ren Acked-by: Max Filippov Acked-by: Christian Borntraeger Acked-by: Michael Ellerman (powerpc) Acked-by: Catalin Marinas Reviewed-by: Alistair Popple Reviewed-by: Ingo Molnar Acked-by: Russell King (Oracle) [arm part] Acked-by: Heiko Carstens Cc: Vasily Gorbik Cc: Stafford Horne Cc: David S. Miller Cc: Johannes Berg Cc: Brian Cain Cc: Richard Henderson Cc: Richard Weinberger Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Janosch Frank Cc: Albert Ou Cc: Anton Ivanov Cc: Dave Hansen Cc: Borislav Petkov Cc: Sven Schnelle Cc: Andrea Arcangeli Cc: James Bottomley Cc: Al Viro Cc: Alexander Gordeev Cc: Jonas Bonn Cc: Will Deacon Cc: Vlastimil Babka Cc: Michal Simek Cc: Matt Turner Cc: Paul Mackerras Cc: David Hildenbrand Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Stefan Kristiansson Cc: Paul Walmsley Cc: Ivan Kokshaysky Cc: Chris Zankel Cc: Hugh Dickins Cc: Dinh Nguyen Cc: Rich Felker Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Thomas Bogendoerfer Cc: Helge Deller Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c29ab4c0cd5c..6b961a29bf26 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -729,6 +729,7 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs * fsync() to complete (for synchronous page faults * in DAX) + * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ @@ -746,6 +747,7 @@ enum vm_fault_reason { VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, + VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, }; -- cgit v1.2.3 From bcc728eb4f446073e0160671d7d0059a4e9aa300 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 31 May 2022 10:04:21 +0800 Subject: mm/damon: remove obsolete comments of kdamond_stop Since commit 0f91d13366a4 ("mm/damon: simplify stop mechanism") delete kdamond_stop and change to use kthread stop mechanism, these obsolete comments should be removed accordingly. Link: https://lkml.kernel.org/r/20220531020421.46849-1-zhouchengming@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 7c62da31ce4b..2765c7d99beb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -397,7 +397,6 @@ struct damon_callback { * detail. * * @kdamond: Kernel thread who does the monitoring. - * @kdamond_stop: Notifies whether kdamond should stop. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * * For each monitoring context, one kernel thread for the monitoring is @@ -406,14 +405,14 @@ struct damon_callback { * Once started, the monitoring thread runs until explicitly required to be * terminated or every monitoring target is invalid. The validity of the * targets is checked via the &damon_operations.target_valid of @ops. The - * termination can also be explicitly requested by writing non-zero to - * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. - * Therefore, users can know whether the monitoring is ongoing or terminated by - * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from - * outside of the monitoring thread must be protected by @kdamond_lock. - * - * Note that the monitoring thread protects only @kdamond and @kdamond_stop via - * @kdamond_lock. Accesses to other fields must be protected by themselves. + * termination can also be explicitly requested by calling damon_stop(). + * The thread sets @kdamond to NULL when it terminates. Therefore, users can + * know whether the monitoring is ongoing or terminated by reading @kdamond. + * Reads and writes to @kdamond from outside of the monitoring thread must + * be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond via @kdamond_lock. + * Accesses to other fields must be protected by themselves. * * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. -- cgit v1.2.3 From 9384d79249d04b03572abb7e551a35d99c9268c0 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 6 Jun 2022 16:15:33 +0200 Subject: mm/highmem: delete memmove_page() Matthew Wilcox reported that, while he was looking at memmove_page(), he realized that it can't actually work. The reasons are hidden in its implementation, which makes use of memmove() on logical addresses provided by kmap_local_page(). memmove() does the wrong thing when it tests "if (dest <= src)". Therefore, delete memmove_page(). No need to change any other code because we have no call sites of memmove_page() across the whole kernel. Link: https://lkml.kernel.org/r/20220606141533.555-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reported-by: Matthew Wilcox Reviewed-by: Baoquan He Reviewed-by: Ira Weiny Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/highmem.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 3af34de54330..fee9835e3793 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -336,19 +336,6 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off, kunmap_local(dst); } -static inline void memmove_page(struct page *dst_page, size_t dst_off, - struct page *src_page, size_t src_off, - size_t len) -{ - char *dst = kmap_local_page(dst_page); - char *src = kmap_local_page(src_page); - - VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); - memmove(dst + dst_off, src + src_off, len); - kunmap_local(src); - kunmap_local(dst); -} - static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { -- cgit v1.2.3 From c200d90049dbe08fa8b016f74b713fddefca0479 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 11 Jun 2022 11:55:48 +0800 Subject: mm: kmemleak: remove kmemleak_not_leak_phys() and the min_count argument to kmemleak_alloc_phys() Patch series "mm: kmemleak: store objects allocated with physical address separately and check when scan", v4. The kmemleak_*_phys() interface uses "min_low_pfn" and "max_low_pfn" to check address. But on some architectures, kmemleak_*_phys() is called before those two variables initialized. The following steps will be taken: 1) Add OBJECT_PHYS flag and rbtree for the objects allocated with physical address 2) Store physical address in objects if allocated with OBJECT_PHYS 3) Check the boundary when scan instead of in kmemleak_*_phys() This patch set will solve: https://lore.kernel.org/r/20220527032504.30341-1-yee.lee@mediatek.com https://lore.kernel.org/r/9dd08bb5-f39e-53d8-f88d-bec598a08c93@gmail.com v3: https://lore.kernel.org/r/20220609124950.1694394-1-patrick.wang.shcn@gmail.com v2: https://lore.kernel.org/r/20220603035415.1243913-1-patrick.wang.shcn@gmail.com v1: https://lore.kernel.org/r/20220531150823.1004101-1-patrick.wang.shcn@gmail.com This patch (of 4): Remove the unused kmemleak_not_leak_phys() function. And remove the min_count argument to kmemleak_alloc_phys() function, assume it's 0. Link: https://lkml.kernel.org/r/20220611035551.1823303-1-patrick.wang.shcn@gmail.com Link: https://lkml.kernel.org/r/20220611035551.1823303-2-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Cc: Yee Lee Signed-off-by: Andrew Morton --- include/linux/kmemleak.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 34684b2026ab..6a3cd1bf4680 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -29,10 +29,9 @@ extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; -extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, +extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) __ref; extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref; -extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, @@ -107,15 +106,12 @@ static inline void kmemleak_no_scan(const void *ptr) { } static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, - int min_count, gfp_t gfp) + gfp_t gfp) { } static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) { } -static inline void kmemleak_not_leak_phys(phys_addr_t phys) -{ -} static inline void kmemleak_ignore_phys(phys_addr_t phys) { } -- cgit v1.2.3 From fc4db90fe71e640e3fe88df346f7cf653b75315d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 10 Jun 2022 11:03:10 -0700 Subject: mm: kmem: make mem_cgroup_from_obj() vmalloc()-safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently mem_cgroup_from_obj() is not working properly with objects allocated using vmalloc(). It creates problems in some cases, when it's called for static objects belonging to modules or generally allocated using vmalloc(). This patch makes mem_cgroup_from_obj() safe to be called on objects allocated using vmalloc(). It also introduces mem_cgroup_from_slab_obj(), which is a faster version to use in places when we know the object is either a slab object or a generic slab page (e.g. when adding an object to a lru list). Link: https://lkml.kernel.org/r/20220610180310.1725111-1-roman.gushchin@linux.dev Suggested-by: Kefeng Wang Signed-off-by: Roman Gushchin Tested-by: Linux Kernel Functional Testing Acked-by: Shakeel Butt Tested-by: Vasily Averin Acked-by: Michal Hocko Acked-by: Muchun Song Cc: Johannes Weiner Cc: Naresh Kamboju Cc: Qian Cai Cc: Kefeng Wang Cc: David S. Miller Cc: Eric Dumazet Cc: Florian Westphal Cc: Jakub Kicinski Cc: Michal Koutný Cc: Paolo Abeni Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9ecead1042b9..3ce96ce5fe3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1740,6 +1740,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_obj(void *p); +struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) @@ -1801,6 +1802,11 @@ static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } +static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +{ + return NULL; +} + static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) { -- cgit v1.2.3 From 1d0403d20f6c281cb3d14c5f1db5317caeec48e9 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 3 Jun 2022 07:19:43 +0300 Subject: net: set proper memcg for net_init hooks allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __register_pernet_operations() executes init hook of registered pernet_operation structure in all existing net namespaces. Typically, these hooks are called by a process associated with the specified net namespace, and all __GFP_ACCOUNT marked allocation are accounted for corresponding container/memcg. However __register_pernet_operations() calls the hooks in the same context, and as a result all marked allocations are accounted to one memcg for all processed net namespaces. This patch adjusts active memcg for each net namespace and helps to account memory allocated inside ops_init() into the proper memcg. Link: https://lkml.kernel.org/r/f9394752-e272-9bf9-645f-a18c56d1c4ec@openvz.org Signed-off-by: Vasily Averin Acked-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Michal Koutný Cc: Vlastimil Babka Cc: Michal Hocko Cc: Florian Westphal Cc: David S. Miller Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Eric Dumazet Cc: Johannes Weiner Cc: Kefeng Wang Cc: Linux Kernel Functional Testing Cc: Muchun Song Cc: Naresh Kamboju Cc: Qian Cai Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 47 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3ce96ce5fe3e..04f2f33607e9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1756,6 +1756,42 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, rcu_read_unlock(); } +/** + * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object. + * @p: pointer to object from which memcg should be extracted. It can be NULL. + * + * Retrieves the memory group into which the memory of the pointed kernel + * object is accounted. If memcg is found, its reference is taken. + * If a passed kernel object is uncharged, or if proper memcg cannot be found, + * as well as if mem_cgroup is disabled, NULL is returned. + * + * Return: valid memcg pointer with taken reference or NULL. + */ +static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + do { + memcg = mem_cgroup_from_obj(p); + } while (memcg && !css_tryget(&memcg->css)); + rcu_read_unlock(); + return memcg; +} + +/** + * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup. + * @memcg: pointer to a valid memory cgroup or NULL. + * + * If passed argument is not NULL, returns it without any additional checks + * and changes. Otherwise, root_mem_cgroup is returned. + * + * NOTE: root_mem_cgroup can be NULL during early boot. + */ +static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) +{ + return memcg ? memcg : root_mem_cgroup; +} #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1799,7 +1835,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) { - return NULL; + return NULL; } static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) @@ -1812,6 +1848,15 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } +static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) +{ + return NULL; +} + +static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) +{ + return NULL; +} #endif /* CONFIG_MEMCG_KMEM */ #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) -- cgit v1.2.3 From ee65728e103bb7dd99d8604bf6c7aa89c7d7e446 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 27 Jun 2022 09:00:26 +0300 Subject: docs: rename Documentation/vm to Documentation/mm so it will be consistent with code mm directory and with Documentation/admin-guide/mm and won't be confused with virtual machines. Signed-off-by: Mike Rapoport Suggested-by: Matthew Wilcox Tested-by: Ira Weiny Acked-by: Jonathan Corbet Acked-by: Wu XiangCheng --- include/linux/hmm.h | 4 ++-- include/linux/memremap.h | 2 +- include/linux/mmu_notifier.h | 2 +- include/linux/sched/mm.h | 4 ++-- include/linux/swap.h | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index d5a6f101f843..126a36571667 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -4,7 +4,7 @@ * * Authors: Jérôme Glisse * - * See Documentation/vm/hmm.rst for reasons and overview of what HMM is. + * See Documentation/mm/hmm.rst for reasons and overview of what HMM is. */ #ifndef LINUX_HMM_H #define LINUX_HMM_H @@ -100,7 +100,7 @@ struct hmm_range { }; /* - * Please see Documentation/vm/hmm.rst for how to use the range API. + * Please see Documentation/mm/hmm.rst for how to use the range API. */ int hmm_range_fault(struct hmm_range *range); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8af304f6b504..9f5ee49482de 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -39,7 +39,7 @@ struct vmem_altmap { * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. + * include/linux/hmm.h and Documentation/mm/hmm.rst. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 45fc2c81e370..d6c06e140277 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -198,7 +198,7 @@ struct mmu_notifier_ops { * invalidate_range_start()/end() notifiers, as * invalidate_range() already catches the points in time when an * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/vm/mmu_notifier.rst + * discussion on this see Documentation/mm/mmu_notifier.rst * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8cd975a8bfeb..2a243616f222 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -29,7 +29,7 @@ extern struct mm_struct *mm_alloc(void); * * Use mmdrop() to release the reference acquired by mmgrab(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) @@ -92,7 +92,7 @@ static inline void mmdrop_sched(struct mm_struct *mm) * * Use mmput() to release the reference acquired by mmget(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0c0fed1b348f..95a5b7aa1ae9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,7 +74,7 @@ static inline int current_is_kswapd(void) /* * Unaddressable device memory support. See include/linux/hmm.h and - * Documentation/vm/hmm.rst. Short description is we need struct pages for + * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * -- cgit v1.2.3 From 507db7927cd181d409dd495c8384b8e14c21c600 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sun, 3 Jul 2022 18:08:36 -0700 Subject: mm: rmap: use the correct parameter name for DEFINE_PAGE_VMA_WALK The parameter used by DEFINE_PAGE_VMA_WALK is _page not page, fix the parameter name. It didn't cause any build error, it is probably because the only caller is write_protect_page() from ksm.c, which pass in page. Link: https://lkml.kernel.org/r/20220512174551.81279-1-shy828301@gmail.com Fixes: 2aff7a4755be ("mm: Convert page_vma_mapped_walk to work on PFNs") Signed-off-by: Yang Shi Reviewed-by: Muchun Song Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 9ec23138e410..bf80adca980b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -325,8 +325,8 @@ struct page_vma_mapped_walk { #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = page_to_pfn(_page), \ - .nr_pages = compound_nr(page), \ - .pgoff = page_to_pgoff(page), \ + .nr_pages = compound_nr(_page), \ + .pgoff = page_to_pgoff(_page), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ -- cgit v1.2.3 From c453d8c7d1384d7e1d7f26d3ec0d527092edf801 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 13 May 2022 12:17:05 -0700 Subject: mm/page_vma_mapped.c: check possible huge PMD map with transhuge_vma_suitable() IIUC page_vma_mapped_walk() checks if the vma is possibly huge PMD mapped with transparent_hugepage_active() and "pvmw->nr_pages >= HPAGE_PMD_NR". Actually pvmw->nr_pages is returned by compound_nr() or folio_nr_pages(), so the page should be THP as long as "pvmw->nr_pages >= HPAGE_PMD_NR". And it is guaranteed THP is allocated for valid VMA in the first place. But it may be not PMD mapped if the VMA is file VMA and it is not properly aligned. The transhuge_vma_suitable() is used to do such check, so replace transparent_hugepage_active() to it, which is too heavy and overkilling. Link: https://lkml.kernel.org/r/20220513191705.457775-1-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Muchun Song Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index de29821231c9..648cb3ce7099 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,8 +117,10 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) + unsigned long addr) { + unsigned long haddr; + /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, @@ -126,6 +128,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return false; } + haddr = addr & HPAGE_PMD_MASK; + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; @@ -342,7 +346,7 @@ static inline bool transparent_hugepage_active(struct vm_area_struct *vma) } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) + unsigned long addr) { return false; } -- cgit v1.2.3 From 7ce82f4c3f3ead13a9d9498768e3b1a79975c4d8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 30 May 2022 19:30:15 +0800 Subject: mm/migration: return errno when isolate_huge_page failed We might fail to isolate huge page due to e.g. the page is under migration which cleared HPageMigratable. We should return errno in this case rather than always return 1 which could confuse the user, i.e. the caller might think all of the memory is migrated while the hugetlb page is left behind. We make the prototype of isolate_huge_page consistent with isolate_lru_page as suggested by Huang Ying and rename isolate_huge_page to isolate_hugetlb as suggested by Muchun to improve the readability. Link: https://lkml.kernel.org/r/20220530113016.16663-4-linmiaohe@huawei.com Fixes: e8db67eb0ded ("mm: migrate: move_pages() supports thp migration") Signed-off-by: Miaohe Lin Suggested-by: Huang Ying Reported-by: kernel test robot (build error) Cc: Alistair Popple Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Howells Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e4cff27d1198..756b66ff025e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -170,7 +170,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_huge_page(struct page *page, struct list_head *list); +int isolate_hugetlb(struct page *page, struct list_head *list); int get_hwpoison_huge_page(struct page *page, bool *hugetlb); int get_huge_page_for_hwpoison(unsigned long pfn, int flags); void putback_active_hugepage(struct page *page); @@ -376,9 +376,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_huge_page(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct page *page, struct list_head *list) { - return false; + return -EBUSY; } static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) -- cgit v1.2.3 From ad1ac596e8a8c4b06715dfbd89853eb73c9886b2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 30 May 2022 19:30:16 +0800 Subject: mm/migration: fix potential pte_unmap on an not mapped pte __migration_entry_wait and migration_entry_wait_on_locked assume pte is always mapped from caller. But this is not the case when it's called from migration_entry_wait_huge and follow_huge_pmd. Add a hugetlbfs variant that calls hugetlb_migration_entry_wait(ptep == NULL) to fix this issue. Link: https://lkml.kernel.org/r/20220530113016.16663-5-linmiaohe@huawei.com Fixes: 30dad30922cc ("mm: migration: add migrate_entry_wait_huge()") Signed-off-by: Miaohe Lin Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Howells Cc: Huang Ying Cc: kernel test robot Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swapops.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index f24775b41880..bb7afd03a324 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -244,8 +244,10 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte); +#ifdef CONFIG_HUGETLB_PAGE +extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); +#endif #else static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -271,8 +273,10 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } -static inline void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte) { } +#ifdef CONFIG_HUGETLB_PAGE +static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } +#endif static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; -- cgit v1.2.3 From c9e124e0382d83d458db204f929002ea98daa6a8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:06 +0000 Subject: mm/damon/{dbgfs,sysfs}: move target_has_pid() from dbgfs to damon.h The function for knowing if given monitoring context's targets will have pid or not is defined and used in dbgfs only. However, the logic is also needed for sysfs. This commit moves the code to damon.h and makes both dbgfs and sysfs to use it. Link: https://lkml.kernel.org/r/20220606182310.48781-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2765c7d99beb..b9aae19fab3e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -525,6 +525,12 @@ bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); +static inline bool damon_target_has_pid(const struct damon_ctx *ctx) +{ + return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR; +} + + int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -- cgit v1.2.3 From d9da8f6cf55eeca642c021912af1890002464c64 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 9 Jun 2022 20:18:46 +0200 Subject: mm: introduce clear_highpage_kasan_tagged Add a clear_highpage_kasan_tagged() helper that does clear_highpage() on a page potentially tagged by KASAN. This helper is used by the following patch. Link: https://lkml.kernel.org/r/4471979b46b2c487787ddcd08b9dc5fedd1b6ffd.1654798516.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/highmem.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fee9835e3793..22379a63e293 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -243,6 +243,16 @@ static inline void clear_highpage(struct page *page) kunmap_local(kaddr); } +static inline void clear_highpage_kasan_tagged(struct page *page) +{ + u8 tag; + + tag = page_kasan_tag(page); + page_kasan_tag_reset(page); + clear_highpage(page); + page_kasan_tag_set(page, tag); +} + #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) -- cgit v1.2.3 From c15187a4a2d660bf490f7873afd0de5288f65c8f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:22 -0700 Subject: mm: memcontrol: introduce mem_cgroup_ino() and mem_cgroup_get_from_ino() Patch series "mm: introduce shrinker debugfs interface", v5. The only existing debugging mechanism is a couple of tracepoints in do_shrink_slab(): mm_shrink_slab_start and mm_shrink_slab_end. They aren't covering everything though: shrinkers which report 0 objects will never show up, there is no support for memcg-aware shrinkers. Shrinkers are identified by their scan function, which is not always enough (e.g. hard to guess which super block's shrinker it is having only "super_cache_scan"). To provide a better visibility and debug options for memory shrinkers this patchset introduces a /sys/kernel/debug/shrinker interface, to some extent similar to /sys/kernel/slab. For each shrinker registered in the system a directory is created. As now, the directory will contain only a "scan" file, which allows to get the number of managed objects for each memory cgroup (for memcg-aware shrinkers) and each numa node (for numa-aware shrinkers on a numa machine). Other interfaces might be added in the future. To make debugging more pleasant, the patchset also names all shrinkers, so that debugfs entries can have meaningful names. This patch (of 5): Shrinker debugfs requires a way to represent memory cgroups without using full paths, both for displaying information and getting input from a user. Cgroup inode number is a perfect way, already used by bpf. This commit adds a couple of helper functions which will be used to handle memcg-aware shrinkers. Link: https://lkml.kernel.org/r/20220601032227.4076670-1-roman.gushchin@linux.dev Link: https://lkml.kernel.org/r/20220601032227.4076670-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Muchun Song Cc: Dave Chinner Cc: Kent Overstreet Cc: Hillf Danton Cc: Christophe JAILLET Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 04f2f33607e9..4d31ce55b1c0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -837,6 +837,15 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +#ifdef CONFIG_SHRINKER_DEBUG +static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +{ + return memcg ? cgroup_ino(memcg->css.cgroup) : 0; +} + +struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); +#endif + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); @@ -1343,6 +1352,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +#ifdef CONFIG_SHRINKER_DEBUG +static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +{ + return 0; +} + +static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +{ + return NULL; +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; -- cgit v1.2.3 From 5035ebc644aec92d55d1bbfe042f35341e4bffb5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:23 -0700 Subject: mm: shrinkers: introduce debugfs interface for memory shrinkers This commit introduces the /sys/kernel/debug/shrinker debugfs interface which provides an ability to observe the state of individual kernel memory shrinkers. Because the feature adds some memory overhead (which shouldn't be large unless there is a huge amount of registered shrinkers), it's guarded by a config option (enabled by default). This commit introduces the "count" interface for each shrinker registered in the system. The output is in the following format: ... ... ... To reduce the size of output on machines with many thousands cgroups, if the total number of objects on all nodes is 0, the line is omitted. If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed as cgroup inode id. If the shrinker is not numa-aware, 0's are printed for all nodes except the first one. This commit gives debugfs entries simple numeric names, which are not very convenient. The following commit in the series will provide shrinkers with more meaningful names. [akpm@linux-foundation.org: remove WARN_ON_ONCE(), per Roman] Reported-by: syzbot+300d27c79fe6d4cbcc39@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20220601032227.4076670-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reviewed-by: Kent Overstreet Acked-by: Muchun Song Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 76fbf92b04d9..2ced8149c513 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -72,6 +72,10 @@ struct shrinker { #ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; +#endif +#ifdef CONFIG_SHRINKER_DEBUG + int debugfs_id; + struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker); extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); -#endif + +#ifdef CONFIG_SHRINKER_DEBUG +extern int shrinker_debugfs_add(struct shrinker *shrinker); +extern void shrinker_debugfs_remove(struct shrinker *shrinker); +#else /* CONFIG_SHRINKER_DEBUG */ +static inline int shrinker_debugfs_add(struct shrinker *shrinker) +{ + return 0; +} +static inline void shrinker_debugfs_remove(struct shrinker *shrinker) +{ +} +#endif /* CONFIG_SHRINKER_DEBUG */ +#endif /* _LINUX_SHRINKER_H */ -- cgit v1.2.3 From e33c267ab70de4249d22d7eab1cc7d68a889bac2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:24 -0700 Subject: mm: shrinkers: provide shrinkers with names Currently shrinkers are anonymous objects. For debugging purposes they can be identified by count/scan function names, but it's not always useful: e.g. for superblock's shrinkers it's nice to have at least an idea of to which superblock the shrinker belongs. This commit adds names to shrinkers. register_shrinker() and prealloc_shrinker() functions are extended to take a format and arguments to master a name. In some cases it's not possible to determine a good name at the time when a shrinker is allocated. For such cases shrinker_debugfs_rename() is provided. The expected format is: -[:]- For some shrinkers an instance can be encoded as (MAJOR:MINOR) pair. After this change the shrinker debugfs directory looks like: $ cd /sys/kernel/debug/shrinker/ $ ls dquota-cache-16 sb-devpts-28 sb-proc-47 sb-tmpfs-42 mm-shadow-18 sb-devtmpfs-5 sb-proc-48 sb-tmpfs-43 mm-zspool:zram0-34 sb-hugetlbfs-17 sb-pstore-31 sb-tmpfs-44 rcu-kfree-0 sb-hugetlbfs-33 sb-rootfs-2 sb-tmpfs-49 sb-aio-20 sb-iomem-12 sb-securityfs-6 sb-tracefs-13 sb-anon_inodefs-15 sb-mqueue-21 sb-selinuxfs-22 sb-xfs:vda1-36 sb-bdev-3 sb-nsfs-4 sb-sockfs-8 sb-zsmalloc-19 sb-bpf-32 sb-pipefs-14 sb-sysfs-26 thp-deferred_split-10 sb-btrfs:vda2-24 sb-proc-25 sb-tmpfs-1 thp-zero-9 sb-cgroup2-30 sb-proc-39 sb-tmpfs-27 xfs-buf:vda1-37 sb-configfs-23 sb-proc-41 sb-tmpfs-29 xfs-inodegc:vda1-38 sb-dax-11 sb-proc-45 sb-tmpfs-35 sb-debugfs-7 sb-proc-46 sb-tmpfs-40 [roman.gushchin@linux.dev: fix build warnings] Link: https://lkml.kernel.org/r/Yr+ZTnLb9lJk6fJO@castle Reported-by: kernel test robot Link: https://lkml.kernel.org/r/20220601032227.4076670-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Cc: Kent Overstreet Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 2ced8149c513..08e6054e061f 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -75,6 +75,7 @@ struct shrinker { #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; + const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ @@ -92,9 +93,11 @@ struct shrinker { */ #define SHRINKER_NONSLAB (1 << 3) -extern int prealloc_shrinker(struct shrinker *shrinker); +extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, + const char *fmt, ...); extern void register_shrinker_prepared(struct shrinker *shrinker); -extern int register_shrinker(struct shrinker *shrinker); +extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, + const char *fmt, ...); extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); @@ -102,6 +105,8 @@ extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); extern void shrinker_debugfs_remove(struct shrinker *shrinker); +extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, + const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { @@ -110,5 +115,10 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) static inline void shrinker_debugfs_remove(struct shrinker *shrinker) { } +static inline __printf(2, 3) +int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) +{ + return 0; +} #endif /* CONFIG_SHRINKER_DEBUG */ #endif /* _LINUX_SHRINKER_H */ -- cgit v1.2.3 From 8cdcc532268df0893d9756f537cbce479f4c4831 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:56 +0000 Subject: mm/damon/schemes: add 'LRU_PRIO' DAMOS action This commit adds a new DAMOS action called 'LRU_PRIO' for the physical address space. The action prioritizes pages in the memory regions of the user-specified target access pattern on their LRU lists. This is hence supposed to be used for frequently accessed (hot) memory regions so that hot pages could be more likely protected under memory pressure. Internally, it simply calls 'mark_page_accessed()'. Link: https://lkml.kernel.org/r/20220613192301.8817-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b9aae19fab3e..4c64e03e94d8 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -86,6 +86,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -95,6 +96,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_LRU_PRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; -- cgit v1.2.3 From 99cdc2cd180a7adc87badc9ca92f8af803d8bf3b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:58 +0000 Subject: mm/damon/schemes: add 'LRU_DEPRIO' action This commit adds a new DAMON-based operation scheme action called 'LRU_DEPRIO' for physical address space. The action deprioritizes pages in the memory area of the target access pattern on their LRU lists. This is hence supposed to be used for rarely accessed (cold) memory regions so that cold pages could be more likely reclaimed first under memory pressure. Internally, it simply calls 'lru_deactivate()'. Using this with 'LRU_PRIO' action for hot pages, users can proactively sort LRU lists based on the access pattern. That is, it can make the LRU lists somewhat more trustworthy source of access temperature. As a result, efficiency of LRU-lists based mechanisms including the reclamation target selection could be improved. Link: https://lkml.kernel.org/r/20220613192301.8817-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 4c64e03e94d8..7b1f4a488230 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -87,6 +87,7 @@ struct damon_target { * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. + * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -97,6 +98,7 @@ enum damos_action { DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, + DAMOS_LRU_DEPRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; -- cgit v1.2.3 From 64fe24a3e05e5f3ac56fcd45afd2fd1d9cc8fcb6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 14 Jun 2022 11:36:29 +0200 Subject: mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection Similar to our MM_CP_DIRTY_ACCT handling for shared, writable mappings, we can try mapping anonymous pages in a private writable mapping writable if they are exclusive, the PTE is already dirty, and no special handling applies. Mapping the anonymous page writable is essentially the same thing the write fault handler would do in this case. Special handling is required for uffd-wp and softdirty tracking, so take care of that properly. Also, leave PROT_NONE handling alone for now; in the future, we could similarly extend the logic in do_numa_page() or use pte_mk_savedwrite() here. While this improves mprotect(PROT_READ)+mprotect(PROT_READ|PROT_WRITE) performance, it should also be a valuable optimization for uffd-wp, when un-protecting. This has been previously suggested by Peter Collingbourne in [1], relevant in the context of the Scudo memory allocator, before we had PageAnonExclusive. This commit doesn't add the same handling for PMDs (i.e., anonymous THP, anonymous hugetlb); benchmark results from Andrea indicate that there are minor performance gains, so it's might still be valuable to streamline that logic for all anonymous pages in the future. As we now also set MM_CP_DIRTY_ACCT for private mappings, let's rename it to MM_CP_TRY_CHANGE_WRITABLE, to make it clearer what's actually happening. Micro-benchmark courtesy of Andrea: === #define _GNU_SOURCE #include #include #include #include #include #define SIZE (1024*1024*1024) int main(int argc, char *argv[]) { char *p; if (posix_memalign((void **)&p, sysconf(_SC_PAGESIZE)*512, SIZE)) perror("posix_memalign"), exit(1); if (madvise(p, SIZE, argc > 1 ? MADV_HUGEPAGE : MADV_NOHUGEPAGE)) perror("madvise"); explicit_bzero(p, SIZE); for (int loops = 0; loops < 40; loops++) { if (mprotect(p, SIZE, PROT_READ)) perror("mprotect"), exit(1); if (mprotect(p, SIZE, PROT_READ|PROT_WRITE)) perror("mprotect"), exit(1); explicit_bzero(p, SIZE); } } === Results on my Ryzen 9 3900X: Stock 10 runs (lower is better): AVG 6.398s, STDEV 0.043 Patched 10 runs (lower is better): AVG 3.780s, STDEV 0.026 === [1] https://lkml.kernel.org/r/20210429214801.2583336-1-pcc@google.com Link: https://lkml.kernel.org/r/20220614093629.76309-1-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Peter Collingbourne Acked-by: Peter Xu Cc: Nadav Amit Cc: Dave Hansen Cc: Andrea Arcangeli Cc: Yang Shi Cc: Hugh Dickins Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cf3d0d673f6b..09ea26056e2f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1962,8 +1962,12 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, * for now all the callers are only use one of the flags at the same * time. */ -/* Whether we should allow dirty bit accounting */ -#define MM_CP_DIRTY_ACCT (1UL << 0) +/* + * Whether we should manually check if we can map individual PTEs writable, + * because something (e.g., COW, uffd-wp) blocks that from happening for all + * PTEs automatically in a writable mapping. + */ +#define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ -- cgit v1.2.3 From b8cecb9376b9d3031cf62b476a0db087b6b01072 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 16:42:44 +0100 Subject: mm/vmscan: convert reclaim_clean_pages_from_list() to folios Patch series "nvert much of vmscan to folios" vmscan always operates on folios since it puts the pages on the LRU list. Switching all of these functions from pages to folios saves 1483 bytes of text from removing all the baggage around calling compound_page() and similar functions. This patch (of 5): This is a straightforward conversion which removes several hidden calls to compound_head, saving 330 bytes of kernel text. Link: https://lkml.kernel.org/r/20220617154248.700416-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220617154248.700416-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e66f7aa3191d..f32aade2a6e0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -670,6 +670,12 @@ static __always_inline bool PageAnon(struct page *page) return folio_test_anon(page_folio(page)); } +static __always_inline bool __folio_test_movable(const struct folio *folio) +{ + return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == + PAGE_MAPPING_MOVABLE; +} + static __always_inline int __PageMovable(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == -- cgit v1.2.3 From e3c4cebf3f9db8c9150eb1982da7e353d9938bed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:49:59 +0100 Subject: mm: add folios_put() Patch series "Convert the swap code to be more folio-based". There's still more to do with the swap code, but this reaps a lot of the folio benefit. More than 4kB of kernel text saved (with the UEK7 kernel config). I don't know how much that's going to translate into CPU savings, but some of those compound_head() calls are on every page free, so it should be noticable. It might even be noticable just from an I-cache consumption perspective. This patch (of 22): This is just a wrapper around release_pages() for now. Place the prototype in mm.h along with folio_put() and folio_put_refs(). Link: https://lkml.kernel.org/r/20220617175020.717127-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220617175020.717127-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 19 +++++++++++++++++++ include/linux/pagemap.h | 2 -- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 09ea26056e2f..09670ccb94e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1220,6 +1220,25 @@ static inline void folio_put_refs(struct folio *folio, int refs) __put_page(&folio->page); } +void release_pages(struct page **pages, int nr); + +/** + * folios_put - Decrement the reference count on an array of folios. + * @folios: The folios. + * @nr: How many folios there are. + * + * Like folio_put(), but for an array of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which + * need to be taken if the folios are freed. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folios_put(struct folio **folios, unsigned int nr) +{ + release_pages((struct page **)folios, nr); +} + static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ce96866fbec4..c399a9c5da7d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -345,8 +345,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -void release_pages(struct page **pages, int nr); - struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -- cgit v1.2.3 From 7d80dd096f8f889128f67a2d452e4dadeed71e63 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:01 +0100 Subject: mm/swap: make __pagevec_lru_add static __pagevec_lru_add has no callers outside swap.c, so make it static, and move it to a more logical position in the file. Link: https://lkml.kernel.org/r/20220617175020.717127-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 67b1246f136b..b0e3540f3a4c 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,7 +26,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, pgoff_t *start, pgoff_t end); -- cgit v1.2.3 From 8d29c7036f5ff360ea1f51b9fed5d909be7c8094 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:13 +0100 Subject: mm/swap: convert __put_page() to __folio_put() Saves 11 bytes of text by removing a check of PageTail. Link: https://lkml.kernel.org/r/20220617175020.717127-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 09670ccb94e7..3fb49aec13fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -855,7 +855,7 @@ static inline struct folio *virt_to_folio(const void *x) return page_folio(page); } -void __put_page(struct page *page); +void __folio_put(struct folio *folio); void put_pages_list(struct list_head *pages); @@ -1197,7 +1197,7 @@ static inline __must_check bool try_get_page(struct page *page) static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) - __put_page(&folio->page); + __folio_put(folio); } /** @@ -1217,7 +1217,7 @@ static inline void folio_put(struct folio *folio) static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) - __put_page(&folio->page); + __folio_put(folio); } void release_pages(struct page **pages, int nr); -- cgit v1.2.3 From 5375336c8c42a343c3b440b6f1e21c65e7b174b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:17 +0100 Subject: mm: convert destroy_compound_page() to destroy_large_folio() All callers now have a folio, so push the folio->page conversion down to this function. [akpm@linux-foundation.org: uninline destroy_large_folio() to fix build issue] Link: https://lkml.kernel.org/r/20220617175020.717127-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3fb49aec13fd..9cc02a7e503b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -892,11 +892,7 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } -static inline void destroy_compound_page(struct page *page) -{ - VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - compound_page_dtors[page[1].compound_dtor](page); -} +void destroy_large_folio(struct folio *folio); static inline int head_compound_pincount(struct page *head) { -- cgit v1.2.3 From ed7802dd48f7a507213cbb95bb4c6f1fe134eb5d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 17 Jun 2022 21:56:49 +0800 Subject: mm: memory_hotplug: enumerate all supported section flags Patch series "make hugetlb_optimize_vmemmap compatible with memmap_on_memory", v3. This series makes hugetlb_optimize_vmemmap compatible with memmap_on_memory. This patch (of 2): We are almost running out of section flags, only one bit is available in the worst case (powerpc with 256k pages). However, there are still some free bits (in ->section_mem_map) on other architectures (e.g. x86_64 has 10 bits available, arm64 has 8 bits available with worst case of 64K pages). We have hard coded those numbers in code, it is inconvenient to use those bits on other architectures except powerpc. So transfer those section flags to enumeration to make it easy to add new section flags in the future. Also, move SECTION_TAINT_ZONE_DEVICE into the scope of CONFIG_ZONE_DEVICE to save a bit on non-zone-device case. [songmuchun@bytedance.com: replace enum with defines per David] Link: https://lkml.kernel.org/r/20220620110616.12056-2-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Hildenbrand Cc: Jonathan Corbet Cc: Mike Kravetz Cc: Oscar Salvador Cc: Paul E. McKenney Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aab70355d64f..2b5757752333 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1418,16 +1418,32 @@ extern size_t mem_section_usage_size(void); * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. - * To sum it up, at least 6 bits are available. + * To sum it up, at least 6 bits are available on all architectures. + * However, we can exceed 6 bits on some other architectures except + * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available + * with the worst case of 64K pages on arm64) if we make sure the + * exceeded bit is not applicable to powerpc. */ -#define SECTION_MARKED_PRESENT (1UL<<0) -#define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_IS_ONLINE (1UL<<2) -#define SECTION_IS_EARLY (1UL<<3) -#define SECTION_TAINT_ZONE_DEVICE (1UL<<4) -#define SECTION_MAP_LAST_BIT (1UL<<5) -#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 6 +enum { + SECTION_MARKED_PRESENT_BIT, + SECTION_HAS_MEM_MAP_BIT, + SECTION_IS_ONLINE_BIT, + SECTION_IS_EARLY_BIT, +#ifdef CONFIG_ZONE_DEVICE + SECTION_TAINT_ZONE_DEVICE_BIT, +#endif + SECTION_MAP_LAST_BIT, +}; + +#define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) +#define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) +#define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) +#define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) +#ifdef CONFIG_ZONE_DEVICE +#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) +#endif +#define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) +#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { @@ -1466,12 +1482,19 @@ static inline int online_section(struct mem_section *section) return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } +#ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } +#else +static inline int online_device_section(struct mem_section *section) +{ + return 0; +} +#endif static inline int online_section_nr(unsigned long nr) { -- cgit v1.2.3 From 66361095129b3b5d065e6c09cf0c085ef4a8c40f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 17 Jun 2022 21:56:50 +0800 Subject: mm: memory_hotplug: make hugetlb_optimize_vmemmap compatible with memmap_on_memory For now, the feature of hugetlb_free_vmemmap is not compatible with the feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap takes precedence over memory_hotplug.memmap_on_memory. However, someone wants to make memory_hotplug.memmap_on_memory takes precedence over hugetlb_free_vmemmap since memmap_on_memory makes it more likely to succeed memory hotplug in close-to-OOM situations. So the decision of making hugetlb_free_vmemmap take precedence is not wise and elegant. The proper approach is to have hugetlb_vmemmap.c do the check whether the section which the HugeTLB pages belong to can be optimized. If the section's vmemmap pages are allocated from the added memory block itself, hugetlb_free_vmemmap should refuse to optimize the vmemmap, otherwise, do the optimization. Then both kernel parameters are compatible. So this patch introduces VmemmapSelfHosted to mask any non-optimizable vmemmap pages. The hugetlb_vmemmap can use this flag to detect if a vmemmap page can be optimized. [songmuchun@bytedance.com: walk vmemmap page tables to avoid false-positive] Link: https://lkml.kernel.org/r/20220620110616.12056-3-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Co-developed-by: Oscar Salvador Signed-off-by: Oscar Salvador Acked-by: David Hildenbrand Cc: Jonathan Corbet Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 9 --------- include/linux/page-flags.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 20d7edf62a6a..e0b2209ab71c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size); extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY -bool mhp_memmap_on_memory(void); -#else -static inline bool mhp_memmap_on_memory(void) -{ - return false; -} -#endif - #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f32aade2a6e0..82719d33c0f1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -193,6 +193,11 @@ enum pageflags { /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, + +#ifdef CONFIG_MEMORY_HOTPLUG + /* For self-hosted memmap pages */ + PG_vmemmap_self_hosted = PG_owner_priv_1, +#endif }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -628,6 +633,12 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison) */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) +#ifdef CONFIG_MEMORY_HOTPLUG +PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) +#else +PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) +#endif + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; -- cgit v1.2.3 From e8da368a1e42a8056d1a6b419e1b91b6cf11d77e Mon Sep 17 00:00:00 2001 From: Yun-Ze Li Date: Mon, 20 Jun 2022 07:15:16 +0000 Subject: mm, docs: fix comments that mention mem_hotplug_end() Comments that mention mem_hotplug_end() are confusing as there is no function called mem_hotplug_end(). Fix them by replacing all the occurences of mem_hotplug_end() in the comments with mem_hotplug_done(). [akpm@linux-foundation.org: grammatical fixes] Link: https://lkml.kernel.org/r/20220620071516.1286101-1-p76091292@gs.ncku.edu.tw Signed-off-by: Yun-Ze Li Cc: Souptick Joarder Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2b5757752333..735bf5b37949 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -591,8 +591,8 @@ struct zone { * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by - * mem_hotplug_begin/end(). Any reader who can't tolerant drift of - * present_pages should get_online_mems() to get a stable value. + * mem_hotplug_begin/done(). Any reader who can't tolerant drift of + * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; @@ -870,7 +870,7 @@ typedef struct pglist_data { unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/end() */ + mem_hotplug_begin/done() */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; -- cgit v1.2.3 From 18f3962953e40401b7ed98e8524167282c3e626e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sun, 26 Jun 2022 22:57:17 +0800 Subject: mm: hugetlb: kill set_huge_swap_pte_at() Commit e5251fd43007 ("mm/hugetlb: introduce set_huge_swap_pte_at() helper") add set_huge_swap_pte_at() to handle swap entries on architectures that support hugepages consisting of contiguous ptes. And currently the set_huge_swap_pte_at() is only overridden by arm64. set_huge_swap_pte_at() provide a sz parameter to help determine the number of entries to be updated. But in fact, all hugetlb swap entries contain pfn information, so we can find the corresponding folio through the pfn recorded in the swap entry, then the folio_size() is the number of entries that need to be updated. And considering that users will easily cause bugs by ignoring the difference between set_huge_swap_pte_at() and set_huge_pte_at(). Let's handle swap entries in set_huge_pte_at() and remove the set_huge_swap_pte_at(), then we can call set_huge_pte_at() anywhere, which simplifies our coding. Link: https://lkml.kernel.org/r/20220626145717.53572-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Cc: Mike Kravetz Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 756b66ff025e..c6cccfaf8708 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -903,14 +903,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) atomic_long_sub(l, &mm->hugetlb_usage); } -#ifndef set_huge_swap_pte_at -static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz) -{ - set_huge_pte_at(mm, addr, ptep, pte); -} -#endif - #ifndef huge_ptep_modify_prot_start #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, @@ -1094,11 +1086,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } -static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz) -{ -} - static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { -- cgit v1.2.3 From 1baec203b77cafa24610b5c9ae7a2aa380d74ef6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:16 +0800 Subject: mm/khugepaged: try to free transhuge swapcache when possible Transhuge swapcaches won't be freed in __collapse_huge_page_copy(). It's because release_pte_page() is not called for these pages and thus free_page_and_swap_cache can't grab the page lock. These pages won't be freed from swap cache even if we are the only user until next time reclaim. It shouldn't hurt indeed, but we could try to free these pages to save more memory for system. Link: https://lkml.kernel.org/r/20220625092816.4856-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 95a5b7aa1ae9..6d11c51b2b62 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -455,6 +455,7 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } +extern void free_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); /* linux/mm/swapfile.c */ @@ -539,6 +540,10 @@ static inline void put_swap_device(struct swap_info_struct *si) /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) +static inline void free_swap_cache(struct page *page) +{ +} + static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { return 0; -- cgit v1.2.3 From 6077c943beee407168f72ece745b0aeaef6b896f Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:08 -0500 Subject: mm: rename is_pinnable_page() to is_longterm_pinnable_page() Patch series "Add MEMORY_DEVICE_COHERENT for coherent device memory mapping", v9. This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory owned by a device that can be mapped into CPU page tables like MEMORY_DEVICE_GENERIC and can also be migrated like MEMORY_DEVICE_PRIVATE. This patch series is mostly self-contained except for a few places where it needs to update other subsystems to handle the new memory type. System stability and performance are not affected according to our ongoing testing, including xfstests. How it works: The system BIOS advertises the GPU device memory (aka VRAM) as SPM (special purpose memory) in the UEFI system address map. The amdgpu driver registers the memory with devmap as MEMORY_DEVICE_COHERENT using devm_memremap_pages. The initial user for this hardware page migration capability is the Frontier supercomputer project. This functionality is not AMD-specific. We expect other GPU vendors to find this functionality useful, and possibly other hardware types in the future. Our test nodes in the lab are similar to the Frontier configuration, with .5 TB of system memory plus 256 GB of device memory split across 4 GPUs, all in a single coherent address space. Page migration is expected to improve application efficiency significantly. We will report empirical results as they become available. Coherent device type pages at gup are now migrated back to system memory if they are being pinned long-term (FOLL_LONGTERM). The reason is, that long-term pinning would interfere with the device memory manager owning the device-coherent pages (e.g. evictions in TTM). These series incorporate Alistair Popple patches to do this migration from pin_user_pages() calls. hmm_gup_test has been added to hmm-test to test different get user pages calls. This series includes handling of device-managed anonymous pages returned by vm_normal_pages. Although they behave like normal pages for purposes of mapping in CPU page tables and for COW, they do not support LRU lists, NUMA migration or THP. We also introduced a FOLL_LRU flag that adds the same behaviour to follow_page and related APIs, to allow callers to specify that they expect to put pages on an LRU list. This patch (of 14): is_pinnable_page() and folio_is_pinnable() are renamed to is_longterm_pinnable_page() and folio_is_longterm_pinnable() respectively. These functions are used in the FOLL_LONGTERM flag context. Link: https://lkml.kernel.org/r/20220715150521.18165-1-alex.sierra@amd.com Link: https://lkml.kernel.org/r/20220715150521.18165-2-alex.sierra@amd.com Signed-off-by: Alex Sierra Reviewed-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Felix Kuehling Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jerome Glisse Cc: Alistair Popple Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9cc02a7e503b..3c044e38958c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1607,7 +1607,7 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION -static inline bool is_pinnable_page(struct page *page) +static inline bool is_longterm_pinnable_page(struct page *page) { #ifdef CONFIG_CMA int mt = get_pageblock_migratetype(page); @@ -1618,15 +1618,15 @@ static inline bool is_pinnable_page(struct page *page) return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)); } #else -static inline bool is_pinnable_page(struct page *page) +static inline bool is_longterm_pinnable_page(struct page *page) { return true; } #endif -static inline bool folio_is_pinnable(struct folio *folio) +static inline bool folio_is_longterm_pinnable(struct folio *folio) { - return is_pinnable_page(&folio->page); + return is_longterm_pinnable_page(&folio->page); } static inline void set_page_zone(struct page *page, enum zone_type zone) -- cgit v1.2.3 From 5bb88dc571b1cbf0284100a317fb21ab7d03e40c Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:09 -0500 Subject: mm: move page zone helpers from mm.h to mmzone.h It makes more sense to have these helpers in zone specific header file, rather than the generic mm.h Link: https://lkml.kernel.org/r/20220715150521.18165-3-alex.sierra@amd.com Signed-off-by: Alex Sierra Cc: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/memremap.h | 2 +- include/linux/mm.h | 78 ---------------------------------------------- include/linux/mmzone.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 79 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 9f5ee49482de..732dde5988fb 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -2,7 +2,7 @@ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ -#include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c044e38958c..a2d01e49253b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1045,84 +1045,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); * back into memory. */ -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. - */ - -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ -#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) -#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) -#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) - -/* - * Define the bit shifts to access each section. For non-existent - * sections we define the shift as 0; that plus a 0 mask ensures - * the compiler will optimise away reference to them. - */ -#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) -#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) - -/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ -#ifdef NODE_NOT_IN_PAGE_FLAGS -#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ - SECTIONS_PGOFF : ZONES_PGOFF) -#else -#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? \ - NODES_PGOFF : ZONES_PGOFF) -#endif - -#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) - -#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) -#define NODES_MASK ((1UL << NODES_WIDTH) - 1) -#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) -#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) -#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) - -static inline enum zone_type page_zonenum(const struct page *page) -{ - ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; -} - -static inline enum zone_type folio_zonenum(const struct folio *folio) -{ - return page_zonenum(&folio->page); -} - -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) -{ - return page_zonenum(page) == ZONE_DEVICE; -} -extern void memmap_init_zone_device(struct zone *, unsigned long, - unsigned long, struct dev_pagemap *); -#else -static inline bool is_zone_device_page(const struct page *page) -{ - return false; -} -#endif - -static inline bool folio_is_zone_device(const struct folio *folio) -{ - return is_zone_device_page(&folio->page); -} - -static inline bool is_zone_movable_page(const struct page *page) -{ - return page_zonenum(page) == ZONE_MOVABLE; -} - #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 735bf5b37949..5da1135e6755 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -730,6 +730,86 @@ static inline bool zone_is_empty(struct zone *zone) return zone->spanned_pages == 0; } +#ifndef BUILD_VDSO32_64 +/* + * The zone field is never updated after free_area_init_core() + * sets it, so none of the operations on it need to be atomic. + */ + +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ +#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) +#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) +#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) +#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) + +/* + * Define the bit shifts to access each section. For non-existent + * sections we define the shift as 0; that plus a 0 mask ensures + * the compiler will optimise away reference to them. + */ +#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) +#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) +#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) + +/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ +#ifdef NODE_NOT_IN_PAGE_FLAGS +#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ + SECTIONS_PGOFF : ZONES_PGOFF) +#else +#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? \ + NODES_PGOFF : ZONES_PGOFF) +#endif + +#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) + +#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) +#define NODES_MASK ((1UL << NODES_WIDTH) - 1) +#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) +#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) +#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) +#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) + +static inline enum zone_type page_zonenum(const struct page *page) +{ + ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); + return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; +} + +static inline enum zone_type folio_zonenum(const struct folio *folio) +{ + return page_zonenum(&folio->page); +} + +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_zone_device_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_DEVICE; +} +extern void memmap_init_zone_device(struct zone *, unsigned long, + unsigned long, struct dev_pagemap *); +#else +static inline bool is_zone_device_page(const struct page *page) +{ + return false; +} +#endif + +static inline bool folio_is_zone_device(const struct folio *folio) +{ + return is_zone_device_page(&folio->page); +} + +static inline bool is_zone_movable_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_MOVABLE; +} +#endif + /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone -- cgit v1.2.3 From f25cbb7a95a24ff9a2a3bebd308e303942ae6b2c Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:10 -0500 Subject: mm: add zone device coherent type memory support Device memory that is cache coherent from device and CPU point of view. This is used on platforms that have an advanced system bus (like CAPI or CXL). Any page of a process can be migrated to such memory. However, no one should be allowed to pin such memory so that it can always be evicted. [hch@lst.de: rebased ontop of the refcount changes, remove is_dev_private_or_coherent_page] Link: https://lkml.kernel.org/r/20220715150521.18165-4-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Acked-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/memremap.h | 19 +++++++++++++++++++ include/linux/mm.h | 5 ++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 732dde5988fb..09320b7f706c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -41,6 +41,13 @@ struct vmem_altmap { * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/mm/hmm.rst. * + * MEMORY_DEVICE_COHERENT: + * Device memory that is cache coherent from device and CPU point of view. This + * is used on platforms that have an advanced system bus (like CAPI or CXL). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allowed to pin such memory so that it can always be evicted. + * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page @@ -61,6 +68,7 @@ struct vmem_altmap { enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, + MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, @@ -150,6 +158,17 @@ static inline bool is_pci_p2pdma_page(const struct page *page) page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } +static inline bool is_device_coherent_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_COHERENT; +} + +static inline bool folio_is_device_coherent(const struct folio *folio) +{ + return is_device_coherent_page(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); diff --git a/include/linux/mm.h b/include/linux/mm.h index a2d01e49253b..64393ed3330a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -28,6 +28,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1537,7 +1538,9 @@ static inline bool is_longterm_pinnable_page(struct page *page) if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)); + return !(is_device_coherent_page(page) || + is_zone_movable_page(page) || + is_zero_pfn(page_to_pfn(page))); } #else static inline bool is_longterm_pinnable_page(struct page *page) -- cgit v1.2.3 From dd19e6d8ffaa1289d75d7833de97faf1b6b2c8e4 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:12 -0500 Subject: mm: add device coherent vma selection for memory migration This case is used to migrate pages from device memory, back to system memory. Device coherent type memory is cache coherent from device and CPU point of view. Link: https://lkml.kernel.org/r/20220715150521.18165-6-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Poppple Reviewed-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 069a89e847f3..b84908debe5c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn) enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, + MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, }; struct migrate_vma { -- cgit v1.2.3 From 8012b866085523758780850087102421dbcce522 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:25 +0800 Subject: dax: introduce holder for dax_device Patch series "v14 fsdax-rmap + v11 fsdax-reflink", v2. The patchset fsdax-rmap is aimed to support shared pages tracking for fsdax. It moves owner tracking from dax_assocaite_entry() to pmem device driver, by introducing an interface ->memory_failure() for struct pagemap. This interface is called by memory_failure() in mm, and implemented by pmem device. Then call holder operations to find the filesystem which the corrupted data located in, and call filesystem handler to track files or metadata associated with this page. Finally we are able to try to fix the corrupted data in filesystem and do other necessary processing, such as killing processes who are using the files affected. The call trace is like this: memory_failure() |* fsdax case |------------ |pgmap->ops->memory_failure() => pmem_pgmap_memory_failure() | dax_holder_notify_failure() => | dax_device->holder_ops->notify_failure() => | - xfs_dax_notify_failure() | |* xfs_dax_notify_failure() | |-------------------------- | | xfs_rmap_query_range() | | xfs_dax_failure_fn() | | * corrupted on metadata | | try to recover data, call xfs_force_shutdown() | | * corrupted on file data | | try to recover data, call mf_dax_kill_procs() |* normal case |------------- |mf_generic_kill_procs() The patchset fsdax-reflink attempts to add CoW support for fsdax, and takes XFS, which has both reflink and fsdax features, as an example. One of the key mechanisms needed to be implemented in fsdax is CoW. Copy the data from srcmap before we actually write data to the destination iomap. And we just copy range in which data won't be changed. Another mechanism is range comparison. In page cache case, readpage() is used to load data on disk to page cache in order to be able to compare data. In fsdax case, readpage() does not work. So, we need another compare data with direct access support. With the two mechanisms implemented in fsdax, we are able to make reflink and fsdax work together in XFS. This patch (of 14): To easily track filesystem from a pmem device, we introduce a holder for dax_device structure, and also its operation. This holder is used to remember who is using this dax_device: - When it is the backend of a filesystem, the holder will be the instance of this filesystem. - When this pmem device is one of the targets in a mapped device, the holder will be this mapped device. In this case, the mapped device has its own dax_device and it will follow the first rule. So that we can finally track to the filesystem we needed. The holder and holder_ops will be set when filesystem is being mounted, or an target device is being activated. Link: https://lkml.kernel.org/r/20220603053738.1218681-1-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/20220603053738.1218681-2-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Cc: Dave Chinner Cc: Jane Chu Cc: Goldwyn Rodrigues Cc: Al Viro Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Miaohe Lin Cc: Dan Williams Cc: Goldwyn Rodrigues Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/dax.h | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index e7b81634c52a..cf85fc36da5f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -43,8 +43,21 @@ struct dax_operations { void *addr, size_t bytes, struct iov_iter *iter); }; +struct dax_holder_operations { + /* + * notify_failure - notify memory failure into inner holder device + * @dax_dev: the dax device which contains the holder + * @offset: offset on this dax device where memory failure occurs + * @len: length of this memory failure event + * @flags: action flags for memory failure handler + */ + int (*notify_failure)(struct dax_device *dax_dev, u64 offset, + u64 len, int mf_flags); +}; + #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); +void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else +static inline void *dax_holder(struct dax_device *dax_dev) +{ + return NULL; +} static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { @@ -114,12 +131,9 @@ struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off); -static inline void fs_put_dax(struct dax_device *dax_dev) -{ - put_dax(dax_dev); -} +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops); +void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off) + u64 *start_off, void *holder, + const struct dax_holder_operations *ops) { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev) +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ @@ -203,6 +218,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, + int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, -- cgit v1.2.3 From 33a8f7f2b3a3437d016d1b4047a4fd37eb6951b3 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:27 +0800 Subject: pagemap,pmem: introduce ->memory_failure() When memory-failure occurs, we call this function which is implemented by each kind of devices. For the fsdax case, pmem device driver implements it. Pmem device driver will find out the filesystem in which the corrupted page located in. With dax_holder notify support, we are able to notify the memory failure from pmem driver to upper layers. If there is something not support in the notify routine, memory_failure will fall back to the generic hanlder. Link: https://lkml.kernel.org/r/20220603053738.1218681-4-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Reviewed-by: Naoya Horiguchi Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/memremap.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 09320b7f706c..19010491a603 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -87,6 +87,18 @@ struct dev_pagemap_ops { * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); + + /* + * Handle the memory failure happens on a range of pfns. Notify the + * processes who are using these pfns, and try to recover the data on + * them if necessary. The mf_flags is finally passed to the recover + * function through the whole notify routine. + * + * When this is not implemented, or it returns -EOPNOTSUPP, the caller + * will fall back to a common handler called mf_generic_kill_procs(). + */ + int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, + unsigned long nr_pages, int mf_flags); }; #define PGMAP_ALTMAP_VALID (1 << 0) -- cgit v1.2.3 From 2f437effc689ef913fbe5e31110580b4e7cf04be Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:28 +0800 Subject: fsdax: introduce dax_lock_mapping_entry() The current dax_lock_page() locks dax entry by obtaining mapping and index in page. To support 1-to-N RMAP in NVDIMM, we need a new function to lock a specific dax entry corresponding to this file's mapping,index. And output the page corresponding to the specific dax entry for caller use. Link: https://lkml.kernel.org/r/20220603053738.1218681-5-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/dax.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index cf85fc36da5f..7116681b48c0 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -161,6 +161,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page); +void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie); #else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { @@ -188,6 +192,17 @@ static inline dax_entry_t dax_lock_page(struct page *page) static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) { } + +static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page) +{ + return 0; +} + +static inline void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie) +{ +} #endif int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, -- cgit v1.2.3 From c36e2024957120566efd99395b5c8cc95b5175c1 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:29 +0800 Subject: mm: introduce mf_dax_kill_procs() for fsdax case This new function is a variant of mf_generic_kill_procs that accepts a file, offset pair instead of a struct to support multiple files sharing a DAX mapping. It is intended to be called by the file systems as part of the memory_failure handler after the file system performed a reverse mapping from the storage address to the file and file offset. Link: https://lkml.kernel.org/r/20220603053738.1218681-6-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Dan Williams Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Miaohe Lin Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 64393ed3330a..d4ebfc206e2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3178,6 +3178,8 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, }; +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); -- cgit v1.2.3 From 6061b69b9a550a2ab84e805d0d2315ba6215f112 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:31 +0800 Subject: fsdax: set a CoW flag when associate reflink mappings Introduce a PAGE_MAPPING_DAX_COW flag to support association with CoW file mappings. In this case, since the dax-rmap has already took the responsibility to look up for shared files by given dax page, the page->mapping is no longer to used for rmap but for marking that this dax page is shared. And to make sure disassociation works fine, we use page->index as refcount, and clear page->mapping to the initial state when page->index is decreased to 0. With the help of this new flag, it is able to distinguish normal case and CoW case, and keep the warning in normal case. Link: https://lkml.kernel.org/r/20220603053738.1218681-8-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 82719d33c0f1..f2ff65f1bf83 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -661,6 +661,12 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) +/* + * Different with flags above, this flag is used only for fsdax mode. It + * indicates that this page->mapping is now under reflink case. + */ +#define PAGE_MAPPING_DAX_COW 0x1 + static __always_inline bool folio_mapping_flags(struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; -- cgit v1.2.3 From 6f7db3894ae23eb5d40af4efb404aa0c072a68d2 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:36 +0800 Subject: fsdax: dedup file range to use a compare function With dax we cannot deal with readpage() etc. So, we create a dax comparison function which is similar with vfs_dedupe_file_range_compare(). And introduce dax_remap_file_range_prep() for filesystem use. Link: https://lkml.kernel.org/r/20220603053738.1218681-13-ruansy.fnst@fujitsu.com Signed-off-by: Goldwyn Rodrigues Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/dax.h | 8 ++++++++ include/linux/fs.h | 12 ++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 7116681b48c0..ba985333e26b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -246,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same, + const struct iomap_ops *ops); +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..134e9d7ad5d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,6 +74,7 @@ struct fsverity_operations; struct fs_context; struct fs_parameter_spec; struct fileattr; +struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2070,10 +2071,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *count, - unsigned int remap_flags); +int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops); +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *count, unsigned int remap_flags); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); -- cgit v1.2.3 From 4fa6893faeaaea4fe4440512d2a708527ef47051 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:35 -0700 Subject: mm: thp: consolidate vma size check to transhuge_vma_suitable There are couple of places that check whether the vma size is ok for THP or whether address fits, they are open coded and duplicate, use transhuge_vma_suitable() to do the job by passing in (vma->end - HPAGE_PMD_SIZE). Move vma size check into hugepage_vma_check(). This will make khugepaged_enter() is as same as khugepaged_enter_vma(). There is just one caller for khugepaged_enter(), replace it to khugepaged_enter_vma() and remove khugepaged_enter(). Link: https://lkml.kernel.org/r/20220616174840.1202070-3-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 11 +++++++++++ include/linux/khugepaged.h | 14 -------------- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 648cb3ce7099..8a5a8bfce0f5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -116,6 +116,17 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; +/* + * Do the below checks: + * - For file vma, check if the linear page offset of vma is + * HPAGE_PMD_NR aligned within the file. The hugepage is + * guaranteed to be hugepage-aligned within the file, but we must + * check that the PMD-aligned addresses in the VMA map to + * PMD-aligned offsets within the file, else the hugepage will + * not be PMD-mappable. + * - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE + * area. + */ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 392d34c3c59a..31ca8a7f78f4 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -51,16 +51,6 @@ static inline void khugepaged_exit(struct mm_struct *mm) if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) __khugepaged_exit(mm); } - -static inline void khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled()) { - if (hugepage_vma_check(vma, vm_flags)) - __khugepaged_enter(vma->vm_mm); - } -} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { @@ -68,10 +58,6 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline void khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) -{ -} static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { -- cgit v1.2.3 From 9fec51689ff60d9766b38051a0b1692f93d95364 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:37 -0700 Subject: mm: thp: kill transparent_hugepage_active() The transparent_hugepage_active() was introduced to show THP eligibility bit in smaps in proc, smaps is the only user. But it actually does the similar check as hugepage_vma_check() which is used by khugepaged. We definitely don't have to maintain two similar checks, so kill transparent_hugepage_active(). This patch also fixed the wrong behavior for VM_NO_KHUGEPAGED vmas. Also move hugepage_vma_check() to huge_memory.c and huge_mm.h since it is not only for khugepaged anymore. [akpm@linux-foundation.org: check vma->vm_mm, per Zach] [akpm@linux-foundation.org: add comment to vdso check] Link: https://lkml.kernel.org/r/20220616174840.1202070-5-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 16 ++++++++++------ include/linux/khugepaged.h | 2 -- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8a5a8bfce0f5..64487bcd0c7b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -202,7 +202,9 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool transparent_hugepage_active(struct vm_area_struct *vma); +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -351,11 +353,6 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -static inline bool transparent_hugepage_active(struct vm_area_struct *vma) -{ - return false; -} - static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { @@ -368,6 +365,13 @@ static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, return false; } +static inline bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps) +{ + return false; +} + static inline void prep_transhuge_page(struct page *page) {} #define transparent_hugepage_flags 0UL diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 31ca8a7f78f4..ea5fd4c398f7 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -10,8 +10,6 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); -extern bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, -- cgit v1.2.3 From 7da4e2cb8b1ff8221759bfc7512d651ee69516dc Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:38 -0700 Subject: mm: thp: kill __transhuge_page_enabled() The page fault path checks THP eligibility with __transhuge_page_enabled() which does the similar thing as hugepage_vma_check(), so use hugepage_vma_check() instead. However page fault allows DAX and !anon_vma cases, so added a new flag, in_pf, to hugepage_vma_check() to make page fault work correctly. The in_pf flag is also used to skip shmem and file THP for page fault since shmem handles THP in its own shmem_fault() and file THP allocation on fault is not supported yet. Also remove hugepage_vma_enabled() since hugepage_vma_check() is the only caller now, it is not necessary to have a helper function. Link: https://lkml.kernel.org/r/20220616174840.1202070-6-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 57 ++----------------------------------------------- 1 file changed, 2 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 64487bcd0c7b..cd8a6c5d9fe5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -146,48 +146,6 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* Explicitly disabled through madvise. */ - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - return true; -} - -/* - * to be used on vmas which are known to support THP. - * Use transparent_hugepage_active otherwise - */ -static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) -{ - - /* - * If the hardware/firmware marked hugepage support disabled. - */ - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) - return false; - - if (!transhuge_vma_enabled(vma, vma->vm_flags)) - return false; - - if (vma_is_temporary_stack(vma)) - return false; - - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) - return true; - - if (vma_is_dax(vma)) - return true; - - if (transparent_hugepage_flags & - (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) - return !!(vma->vm_flags & VM_HUGEPAGE); - - return false; -} - static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -204,7 +162,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps); + bool smaps, bool in_pf); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -348,26 +306,15 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) -{ - return false; -} - static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { return false; } -static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - return false; -} - static inline bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps) + bool smaps, bool in_pf) { return false; } -- cgit v1.2.3 From 1064026bab9f011bdea1251d44d66bbbcee04f6e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:39 -0700 Subject: mm: khugepaged: reorg some khugepaged helpers The khugepaged_{enabled|always|req_madv} are not khugepaged only anymore, move them to huge_mm.h and rename to hugepage_flags_xxx, and remove khugepaged_req_madv due to no users. Also move khugepaged_defrag to khugepaged.c since its only caller is in that file, it doesn't have to be in a header file. Link: https://lkml.kernel.org/r/20220616174840.1202070-7-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 ++++++++ include/linux/khugepaged.h | 14 -------------- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cd8a6c5d9fe5..ae3d8e2fd9e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -116,6 +116,14 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; +#define hugepage_flags_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) -- cgit v1.2.3 From e95a9851787bbb3cd4deb40fe8bab03f731852d1 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 21 Jun 2022 16:56:17 -0700 Subject: hugetlb: skip to end of PT page mapping when pte not present Patch series "hugetlb: speed up linear address scanning", v2. At unmap, fork and remap time hugetlb address ranges are linearly scanned. We can optimize these scans if the ranges are sparsely populated. Also, enable page table "Lazy copy" for hugetlb at fork. NOTE: Architectures not defining CONFIG_ARCH_WANT_GENERAL_HUGETLB need to add an arch specific version hugetlb_mask_last_page() to take advantage of sparse address scanning improvements. Baolin Wang added the routine for arm64. Other architectures which could be optimized are: ia64, mips, parisc, powerpc, s390, sh and sparc. This patch (of 4): HugeTLB address ranges are linearly scanned during fork, unmap and remap operations. If a non-present entry is encountered, the code currently continues to the next huge page aligned address. However, a non-present entry implies that the page table page for that entry is not present. Therefore, the linear scan can skip to the end of range mapped by the page table page. This can speed operations on large sparsely populated hugetlb mappings. Create a new routine hugetlb_mask_last_page() that will return an address mask. When the mask is ORed with an address, the result will be the address of the last huge page mapped by the associated page table page. Use this mask to update addresses in routines which linearly scan hugetlb address ranges when a non-present pte is encountered. hugetlb_mask_last_page is related to the implementation of huge_pte_offset as hugetlb_mask_last_page is called when huge_pte_offset returns NULL. This patch only provides a complete hugetlb_mask_last_page implementation when CONFIG_ARCH_WANT_GENERAL_HUGETLB is defined. Architectures which provide their own versions of huge_pte_offset can also provide their own version of hugetlb_mask_last_page. Link: https://lkml.kernel.org/r/20220621235620.291305-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220621235620.291305-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Tested-by: Baolin Wang Reviewed-by: Baolin Wang Acked-by: Muchun Song Reported-by: kernel test robot Cc: Michal Hocko Cc: Peter Xu Cc: Naoya Horiguchi Cc: James Houghton Cc: Mina Almasry Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Paul Walmsley Cc: Christian Borntraeger Cc: Catalin Marinas Cc: Will Deacon Cc: Rolf Eike Beer Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c6cccfaf8708..ce30fad5fd13 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -194,6 +194,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); +unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, -- cgit v1.2.3 From 4ddb4d91b82f4b64458fe35bc8e395c7c082ea2b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 21 Jun 2022 16:56:19 -0700 Subject: hugetlb: do not update address in huge_pmd_unshare As an optimization for loops sequentially processing hugetlb address ranges, huge_pmd_unshare would update a passed address if it unshared a pmd. Updating a loop control variable outside the loop like this is generally a bad idea. These loops are now using hugetlb_mask_last_page to optimize scanning when non-present ptes are discovered. The same can be done when huge_pmd_unshare returns 1 indicating a pmd was unshared. Remove address update from huge_pmd_unshare. Change the passed argument type and update all callers. In loops sequentially processing addresses use hugetlb_mask_last_page to update address if pmd is unshared. [sfr@canb.auug.org.au: fix an unused variable warning/error] Link: https://lkml.kernel.org/r/20220622171117.70850960@canb.auug.org.au Link: https://lkml.kernel.org/r/20220621235620.291305-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Signed-off-by: Stephen Rothwell Acked-by: Muchun Song Reviewed-by: Baolin Wang Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David Hildenbrand Cc: James Houghton Cc: kernel test robot Cc: Michal Hocko Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Paul Walmsley Cc: Peter Xu Cc: Rolf Eike Beer Cc: Will Deacon Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ce30fad5fd13..75ee739d815b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -196,7 +196,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep); + unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, @@ -243,7 +243,7 @@ static inline struct address_space *hugetlb_page_mapping_lock_write( static inline int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { return 0; } -- cgit v1.2.3 From bf75f200569dd05ac2112797f44548beb6b4be26 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:17 +0100 Subject: mm/page_alloc: add page->buddy_list and page->pcp_list Patch series "Drain remote per-cpu directly", v5. Some setups, notably NOHZ_FULL CPUs, may be running realtime or latency-sensitive applications that cannot tolerate interference due to per-cpu drain work queued by __drain_all_pages(). Introduce a new mechanism to remotely drain the per-cpu lists. It is made possible by remotely locking 'struct per_cpu_pages' new per-cpu spinlocks. This has two advantages, the time to drain is more predictable and other unrelated tasks are not interrupted. This series has the same intent as Nicolas' series "mm/page_alloc: Remote per-cpu lists drain support" -- avoid interference of a high priority task due to a workqueue item draining per-cpu page lists. While many workloads can tolerate a brief interruption, it may cause a real-time task running on a NOHZ_FULL CPU to miss a deadline and at minimum, the draining is non-deterministic. Currently an IRQ-safe local_lock protects the page allocator per-cpu lists. The local_lock on its own prevents migration and the IRQ disabling protects from corruption due to an interrupt arriving while a page allocation is in progress. This series adjusts the locking. A spinlock is added to struct per_cpu_pages to protect the list contents while local_lock_irq is ultimately replaced by just the spinlock in the final patch. This allows a remote CPU to safely. Follow-on work should allow the spin_lock_irqsave to be converted to spin_lock to avoid IRQs being disabled/enabled in most cases. The follow-on patch will be one kernel release later as it is relatively high risk and it'll make bisections more clear if there are any problems. Patch 1 is a cosmetic patch to clarify when page->lru is storing buddy pages and when it is storing per-cpu pages. Patch 2 shrinks per_cpu_pages to make room for a spin lock. Strictly speaking this is not necessary but it avoids per_cpu_pages consuming another cache line. Patch 3 is a preparation patch to avoid code duplication. Patch 4 is a minor correction. Patch 5 uses a spin_lock to protect the per_cpu_pages contents while still relying on local_lock to prevent migration, stabilise the pcp lookup and prevent IRQ reentrancy. Patch 6 remote drains per-cpu pages directly instead of using a workqueue. Patch 7 uses a normal spinlock instead of local_lock for remote draining This patch (of 7): The page allocator uses page->lru for storing pages on either buddy or PCP lists. Create page->buddy_list and page->pcp_list as a union with page->lru. This is simply to clarify what type of list a page is on in the page allocator. No functional change intended. [minchan@kernel.org: fix page lru fields in macros] Link: https://lkml.kernel.org/r/20220624125423.6126-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Minchan Kim Acked-by: Minchan Kim Reviewed-by: Nicolas Saenz Julienne Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Marcelo Tosatti Cc: Michal Hocko Cc: Hugh Dickins Cc: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b961a29bf26..cf97f3884fda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -87,6 +87,7 @@ struct page { */ union { struct list_head lru; + /* Or, for the Unevictable "LRU list" slot */ struct { /* Always even, to negate PageTail */ @@ -94,6 +95,10 @@ struct page { /* Count page's or folio's mlocks */ unsigned int mlock_count; }; + + /* Or, free page */ + struct list_head buddy_list; + struct list_head pcp_list; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; -- cgit v1.2.3 From 5d0a661d808fc8ddc26940b1a12b82ae356f3ae2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:18 +0100 Subject: mm/page_alloc: use only one PCP list for THP-sized allocations The per_cpu_pages is cache-aligned on a standard x86-64 distribution configuration but a later patch will add a new field which would push the structure into the next cache line. Use only one list to store THP-sized pages on the per-cpu list. This assumes that the vast majority of THP-sized allocations are GFP_MOVABLE but even if it was another type, it would not contribute to serious fragmentation that potentially causes a later THP allocation failure. Align per_cpu_pages on the cacheline boundary to ensure there is no false cache sharing. After this patch, the structure sizing is; struct per_cpu_pages { int count; /* 0 4 */ int high; /* 4 4 */ int batch; /* 8 4 */ short int free_factor; /* 12 2 */ short int expire; /* 14 2 */ struct list_head lists[13]; /* 16 208 */ /* size: 256, cachelines: 4, members: 6 */ /* padding: 32 */ } __attribute__((__aligned__(64))); Link: https://lkml.kernel.org/r/20220624125423.6126-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Minchan Kim Acked-by: Minchan Kim Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Nicolas Saenz Julienne Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5da1135e6755..041136b5628a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -355,15 +355,18 @@ enum zone_watermarks { }; /* - * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional - * for pageblock size for THP if configured. + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list + * for THP which will usually be GFP_MOVABLE. Even if it is another type, + * it should not contribute to serious fragmentation causing THP allocation + * failures. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 1 #else #define NR_PCP_THP 0 #endif -#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP)) +#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) +#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Shift to encode migratetype and order in the same integer, with order @@ -389,7 +392,7 @@ struct per_cpu_pages { /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; -}; +} ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP -- cgit v1.2.3 From 4b23a68f953628eb4e4b7fe1294ebf93d4b8ceee Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:21 +0100 Subject: mm/page_alloc: protect PCP lists with a spinlock Currently the PCP lists are protected by using local_lock_irqsave to prevent migration and IRQ reentrancy but this is inconvenient. Remote draining of the lists is impossible and a workqueue is required and every task allocation/free must disable then enable interrupts which is expensive. As preparation for dealing with both of those problems, protect the lists with a spinlock. The IRQ-unsafe version of the lock is used because IRQs are already disabled by local_lock_irqsave. spin_trylock is used in combination with local_lock_irqsave() but later will be replaced with a spin_trylock_irqsave when the local_lock is removed. The per_cpu_pages still fits within the same number of cache lines after this patch relative to before the series. struct per_cpu_pages { spinlock_t lock; /* 0 4 */ int count; /* 4 4 */ int high; /* 8 4 */ int batch; /* 12 4 */ short int free_factor; /* 16 2 */ short int expire; /* 18 2 */ /* XXX 4 bytes hole, try to pack */ struct list_head lists[13]; /* 24 208 */ /* size: 256, cachelines: 4, members: 7 */ /* sum members: 228, holes: 1, sum holes: 4 */ /* padding: 24 */ } __attribute__((__aligned__(64))); There is overhead in the fast path due to acquiring the spinlock even though the spinlock is per-cpu and uncontended in the common case. Page Fault Test (PFT) running on a 1-socket reported the following results on a 1 socket machine. 5.19.0-rc3 5.19.0-rc3 vanilla mm-pcpspinirq-v5r16 Hmean faults/sec-1 869275.7381 ( 0.00%) 874597.5167 * 0.61%* Hmean faults/sec-3 2370266.6681 ( 0.00%) 2379802.0362 * 0.40%* Hmean faults/sec-5 2701099.7019 ( 0.00%) 2664889.7003 * -1.34%* Hmean faults/sec-7 3517170.9157 ( 0.00%) 3491122.8242 * -0.74%* Hmean faults/sec-8 3965729.6187 ( 0.00%) 3939727.0243 * -0.66%* There is a small hit in the number of faults per second but given that the results are more stable, it's borderline noise. [akpm@linux-foundation.org: add missing local_unlock_irqrestore() on contention path] Link: https://lkml.kernel.org/r/20220624125423.6126-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Yu Zhao Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 041136b5628a..578247a341b2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -382,6 +382,7 @@ enum zone_watermarks { /* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { + spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ -- cgit v1.2.3 From 840532711d7299d7e937952482ec899d4622c452 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:35 +0530 Subject: mm/mmap: build protect protection_map[] with __P000 Patch series "mm/mmap: Drop __SXXX/__PXXX macros from across platforms", v7. __SXXX/__PXXX macros are unnecessary abstraction layer in creating the generic protection_map[] array which is used for vm_get_page_prot(). This abstraction layer can be avoided, if the platforms just define the array protection_map[] for all possible vm_flags access permission combinations and also export vm_get_page_prot() implementation. This series drops __SXXX/__PXXX macros from across platforms in the tree. First it build protects generic protection_map[] array with '#ifdef __P000' and moves it inside platforms which enable ARCH_HAS_VM_GET_PAGE_PROT. Later this build protects same array with '#ifdef ARCH_HAS_VM_GET_PAGE_PROT' and moves inside remaining platforms while enabling ARCH_HAS_VM_GET_PAGE_PROT. This adds a new macro DECLARE_VM_GET_PAGE_PROT defining the current generic vm_get_page_prot(), in order for it to be reused on platforms that do not require custom implementation. Finally, ARCH_HAS_VM_GET_PAGE_PROT can just be dropped, as all platforms now define and export vm_get_page_prot(), via looking up a private and static protection_map[] array. protection_map[] data type has been changed as 'static const' on all platforms that do not change it during boot. This patch (of 26): Build protect generic protection_map[] array with __P000, so that it can be moved inside all the platforms one after the other. Otherwise there will be build failures during this process. CONFIG_ARCH_HAS_VM_GET_PAGE_PROT cannot be used for this purpose as only certain platforms enable this config now. Link: https://lkml.kernel.org/r/20220711070600.2378316-1-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/20220711070600.2378316-2-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christoph Hellwig Reviewed-by: Christophe Leroy Suggested-by: Christophe Leroy Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d4ebfc206e2b..1a435ce146a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,7 +425,9 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ +#ifdef __P000 extern pgprot_t protection_map[16]; +#endif /* * The default fault flags that should be used by most of the -- cgit v1.2.3 From 43957b5d11037a651d162f65c682ec3c76777fc8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:36 +0530 Subject: mm/mmap: define DECLARE_VM_GET_PAGE_PROT This just converts the generic vm_get_page_prot() implementation into a new macro i.e DECLARE_VM_GET_PAGE_PROT which later can be used across platforms when enabling them with ARCH_HAS_VM_GET_PAGE_PROT. This does not create any functional change. Link: https://lkml.kernel.org/r/20220711070600.2378316-3-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Suggested-by: Christoph Hellwig Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 3cdc16cfd867..014ee8f0fbaa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1689,4 +1689,32 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and + * MAP_PRIVATE (with Enhanced PAN supported): + * r: (no) no + * w: (no) no + * x: (yes) yes + */ +#define DECLARE_VM_GET_PAGE_PROT \ +pgprot_t vm_get_page_prot(unsigned long vm_flags) \ +{ \ + return protection_map[vm_flags & \ + (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ +} \ +EXPORT_SYMBOL(vm_get_page_prot); + #endif /* _LINUX_PGTABLE_H */ -- cgit v1.2.3 From 09095f74130dfb2110ef2bcdd9ad0d42addaa1d5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:41 +0530 Subject: mm/mmap: build protect protection_map[] with ARCH_HAS_VM_GET_PAGE_PROT Now that protection_map[] has been moved inside those platforms that enable ARCH_HAS_VM_GET_PAGE_PROT. Hence generic protection_map[] array now can be protected with CONFIG_ARCH_HAS_VM_GET_PAGE_PROT intead of __P000. Link: https://lkml.kernel.org/r/20220711070600.2378316-8-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a435ce146a2..4b4dc93f9bc3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,7 +425,7 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ -#ifdef __P000 +#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT extern pgprot_t protection_map[16]; #endif -- cgit v1.2.3 From 3d923c5f1e21ad491acd4c0d62bf2481ce94016c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:36:00 +0530 Subject: mm/mmap: drop ARCH_HAS_VM_GET_PAGE_PROT Now all the platforms enable ARCH_HAS_GET_PAGE_PROT. They define and export own vm_get_page_prot() whether custom or standard DECLARE_VM_GET_PAGE_PROT. Hence there is no need for default generic fallback for vm_get_page_prot(). Just drop this fallback and also ARCH_HAS_GET_PAGE_PROT mechanism. Link: https://lkml.kernel.org/r/20220711070600.2378316-27-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Reviewed-by: Christophe Leroy Acked-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4b4dc93f9bc3..61e3101c44ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,9 +425,6 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ -#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT -extern pgprot_t protection_map[16]; -#endif /* * The default fault flags that should be used by most of the -- cgit v1.2.3 From 3ce4fee4401206cf5a2c476ec0ee6c90191dfade Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:55 +0800 Subject: mm/huge_memory: check pmd_present first in is_huge_zero_pmd When pmd is non-present, pmd_pfn returns an insane value. So we should check pmd_present first to avoid acquiring such insane value and also avoid touching possible cold huge_zero_pfn cache line when pmd isn't present. Link: https://lkml.kernel.org/r/20220704132201.14611-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ae3d8e2fd9e2..12b297f9951d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -273,7 +273,7 @@ static inline bool is_huge_zero_page(struct page *page) static inline bool is_huge_zero_pmd(pmd_t pmd) { - return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); + return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } static inline bool is_huge_zero_pud(pud_t pud) -- cgit v1.2.3 From 121c1781aeb00475d163246d9ae7d8746e377040 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:58 +0800 Subject: mm/huge_memory: fix comment of page_deferred_list The current comment is confusing because if global or memcg deferred list in the second tail page is occupied by compound_head, why we still use page[2].deferred_list here? I think it wants to say that Global or memcg deferred list in the first tail page is occupied by compound_mapcount and compound_pincount so we use the second tail page's deferred_list instead. Link: https://lkml.kernel.org/r/20220704132201.14611-14-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 12b297f9951d..37f2f11a6d7e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -294,8 +294,8 @@ static inline bool thp_migration_supported(void) static inline struct list_head *page_deferred_list(struct page *page) { /* - * Global or memcg deferred list in the second tail pages is - * occupied by compound_head. + * See organization of tail pages of compound page in + * "struct page" definition. */ return &page[2].deferred_list; } -- cgit v1.2.3 From dcadcf1c30619ead2f3280bfb7f74de8304be2bb Mon Sep 17 00:00:00 2001 From: Gang Li Date: Wed, 6 Jul 2022 11:46:54 +0800 Subject: mm, hugetlb: skip irrelevant nodes in show_free_areas() show_free_areas() allows to filter out node specific data which is irrelevant to the allocation request. But hugetlb_show_meminfo() still shows hugetlb on all nodes, which is redundant and unnecessary. Use show_mem_node_skip() to skip irrelevant nodes. And replace hugetlb_show_meminfo() with hugetlb_show_meminfo_node(nid). before-and-after sample output of OOM: before: ``` [ 214.362453] Node 1 active_anon:148kB inactive_anon:4050920kB active_file:112kB inactive_file:100kB [ 214.375429] Node 1 Normal free:45100kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig [ 214.388334] lowmem_reserve[]: 0 0 0 0 0 [ 214.390251] Node 1 Normal: 423*4kB (UE) 320*8kB (UME) 187*16kB (UE) 117*32kB (UE) 57*64kB (UME) 20 [ 214.397626] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB [ 214.401518] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB ``` after: ``` [ 145.069705] Node 1 active_anon:128kB inactive_anon:4049412kB active_file:56kB inactive_file:84kB u [ 145.110319] Node 1 Normal free:45424kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig [ 145.152315] lowmem_reserve[]: 0 0 0 0 0 [ 145.155244] Node 1 Normal: 470*4kB (UME) 373*8kB (UME) 247*16kB (UME) 168*32kB (UE) 86*64kB (UME) [ 145.164119] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB ``` Link: https://lkml.kernel.org/r/20220706034655.1834-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 75ee739d815b..4cdfce976644 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,7 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); -void hugetlb_show_meminfo(void); +void hugetlb_show_meminfo_node(int nid); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); @@ -298,7 +298,7 @@ static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid) return 0; } -static inline void hugetlb_show_meminfo(void) +static inline void hugetlb_show_meminfo_node(int nid) { } -- cgit v1.2.3 From 0d8bc0b10aeab543bdccb86180f58db1f79f7cee Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 13 Jul 2022 20:53:14 +0800 Subject: writeback: cleanup bdi_sched_wait() bdi_sched_wait() is no longer used since commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue"), so remove it. Link: https://lkml.kernel.org/r/20220713125314.171345-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Acked-by: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d452071db572..e84b745a6811 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -140,12 +140,6 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } -static inline int bdi_sched_wait(void *word) -{ - schedule(); - return 0; -} - #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, -- cgit v1.2.3 From fef3e9066d19230f661048ca86937d954c12cd50 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 14 Jul 2022 16:41:47 +0800 Subject: writeback: remove inode_to_wb_is_valid() inode_to_wb_is_valid() is no longer used since commit fe55d563d417 ("remove inode_congested()"), remove it. Link: https://lkml.kernel.org/r/20220714084147.140324-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e84b745a6811..439815cc1ab9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -229,18 +229,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_to_wb_is_valid - test whether an inode has a wb associated - * @inode: inode of interest - * - * Returns %true if @inode has a wb associated. May be called without any - * locking. - */ -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return inode->i_wb; -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -339,11 +327,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return true; -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; -- cgit v1.2.3 From 73b73bac90d97400e29e585c678c4d0ebfd2680d Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 14 Jul 2022 06:49:18 +0000 Subject: mm: vmpressure: don't count proactive reclaim in vmpressure memory.reclaim is a cgroup v2 interface that allows users to proactively reclaim memory from a memcg, without real memory pressure. Reclaim operations invoke vmpressure, which is used: (a) To notify userspace of reclaim efficiency in cgroup v1, and (b) As a signal for a memcg being under memory pressure for networking (see mem_cgroup_under_socket_pressure()). For (a), vmpressure notifications in v1 are not affected by this change since memory.reclaim is a v2 feature. For (b), the effects of the vmpressure signal (according to Shakeel [1]) are as follows: 1. Reducing send and receive buffers of the current socket. 2. May drop packets on the rx path. 3. May throttle current thread on the tx path. Since proactive reclaim is invoked directly by userspace, not by memory pressure, it makes sense not to throttle networking. Hence, this change makes sure that proactive reclaim caused by memory.reclaim does not trigger vmpressure. [1] https://lore.kernel.org/lkml/CALvZod68WdrXEmBpOkadhB5GPYmCXaDZzXH=yyGOCAjFRn4NDQ@mail.gmail.com/ [yosryahmed@google.com: update documentation] Link: https://lkml.kernel.org/r/20220721173015.2643248-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20220714064918.2576464-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Johannes Weiner Cc: Roman Gushchin Cc: Muchun Song Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Miaohe Lin Cc: NeilBrown Cc: Alistair Popple Cc: Suren Baghdasaryan Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6d11c51b2b62..ea895b40e6ff 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -411,10 +411,13 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page, extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); + +#define MEMCG_RECLAIM_MAY_SWAP (1 << 1) +#define MEMCG_RECLAIM_PROACTIVE (1 << 2) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, -- cgit v1.2.3 From e408e695f5f1f60d784913afc45ff2c387a5aeb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Jul 2022 21:59:12 -0400 Subject: mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs This allows userspace to set flags like FS_APPEND_FL, FS_IMMUTABLE_FL, FS_NODUMP_FL, etc., like all other standard Linux file systems. [akpm@linux-foundation.org: fix CONFIG_TMPFS_XATTR=n warnings] Link: https://lkml.kernel.org/r/20220715015912.2560575-1-tytso@mit.edu Signed-off-by: Theodore Ts'o Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a68f982f22d1..1b6c4013f691 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -25,9 +25,20 @@ struct shmem_inode_info { struct simple_xattrs xattrs; /* list of xattrs */ atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ + unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ struct inode vfs_inode; }; +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE +#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ -- cgit v1.2.3 From bb077c3ffd5362a6d9e60574e1bcc83fe8e3fb27 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 26 Jul 2022 21:18:16 +0800 Subject: mm: cleanup is_highmem() It is unnecessary to add CONFIG_HIGHMEM check in is_highmem(), which has been done in is_highmem_idx(), and move is_highmem() close to is_highmem_idx(). This has no functional impact. Link: https://lkml.kernel.org/r/20220726131816.149075-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 578247a341b2..e24b40c52468 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1137,15 +1137,6 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void); -#else -static inline bool has_managed_dma(void) -{ - return false; -} -#endif - /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references @@ -1155,12 +1146,17 @@ static inline bool has_managed_dma(void) */ static inline int is_highmem(struct zone *zone) { -#ifdef CONFIG_HIGHMEM return is_highmem_idx(zone_idx(zone)); +} + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); #else - return 0; -#endif +static inline bool has_managed_dma(void) +{ + return false; } +#endif /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -- cgit v1.2.3