From bfc1d1782984903457b2707d05a35b24ce5fbea1 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 26 Nov 2024 09:56:54 -0600 Subject: mm: migrate: remove unused argument vma from migrate_misplaced_folio() Commit ee86814b0562 ("mm/migrate: move NUMA hinting fault folio isolation + checks under PTL") removed the code that had used the vma argument in migrate_misplaced_folio. Since the vma argument was no longer used in migrate_misplaced_folio, this patch removes it. Link: https://lkml.kernel.org/r/20241126155655.466186-1-donettom@linux.ibm.com Signed-off-by: Donet Tom Reviewed-by: Baolin Wang Reviewed-by: Zi Yan Acked-by: David Hildenbrand Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- include/linux/migrate.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 002e49b2ebd9..29919faea2f1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -144,16 +144,14 @@ const struct movable_operations *page_movable_ops(struct page *page) #ifdef CONFIG_NUMA_BALANCING int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node); -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node); +int migrate_misplaced_folio(struct folio *folio, int node); #else static inline int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ } -static inline int migrate_misplaced_folio(struct folio *folio, - struct vm_area_struct *vma, int node) +static inline int migrate_misplaced_folio(struct folio *folio, int node) { return -EAGAIN; /* can't migrate now */ } -- cgit v1.2.3 From 38558b2460d7881a3de3bdc31a23fa7034384d00 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:34 +0000 Subject: mm: make alloc_pages_mpol() static All callers outside mempolicy.c now use folio_alloc_mpol() thanks to Kefeng's 
cleanups, so we can remove this as a visible symbol. And also remove the alloc_hooks for alloc_pages_mpol(), since all users in mempolicy.c are using the nonprof version. Link: https://lkml.kernel.org/r/20241125210149.2976098-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b0fe9f62d15b..c96d5d7f7b89 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -300,8 +300,6 @@ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); @@ -312,11 +310,6 @@ static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } -static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid) -{ - return alloc_pages_noprof(gfp, order); -} static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); @@ -331,7 +324,6 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -#define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) 
alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) -- cgit v1.2.3 From 9023691d75f29fde884f6e243bcdad6a9dbadb19 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 25 Nov 2024 09:16:17 -0800 Subject: mm: mmap_lock: optimize mmap_lock tracepoints We are starting to deploy mmap_lock tracepoint monitoring across our fleet and the early results showed that these tracepoints are consuming a significant amount of CPU in kernfs_path_from_node when enabled. It seems like the kernel is trying to resolve the cgroup path in the fast path of the locking code path when the tracepoints are enabled. In addition, some applications see their metrics regress when monitoring is enabled. The cgroup path resolution can be slow and should not be done in the fast path. Most userspace tools, like bpftrace, provide functionality to get the cgroup path from the cgroup id, so let's just trace the cgroup id and the users can use better tools to get the path in the slow path.
Link: https://lkml.kernel.org/r/20241125171617.113892-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Yosry Ahmed Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Axel Rasmussen Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Steven Rostedt Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ++++++++++++++++++++++ include/trace/events/mmap_lock.h | 32 +++++++++++++++----------------- 2 files changed, 37 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5502aa8e138e..b28180269e75 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1046,6 +1046,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + u64 id; + + if (mem_cgroup_disabled()) + return 0; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (!memcg) + memcg = root_mem_cgroup; + id = cgroup_id(memcg->css.cgroup); + rcu_read_unlock(); + return id; +} + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1466,6 +1483,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } + +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + return 0; +} #endif /* CONFIG_MEMCG */ /* diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index bc2e3ad787b3..cf9f9faf8914 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -5,6 +5,7 @@ #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_LOCK_H +#include #include #include @@ -12,64 +13,61 @@ struct mm_struct; DECLARE_EVENT_CLASS(mmap_lock, - 
TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + TP_PROTO(struct mm_struct *mm, bool write), - TP_ARGS(mm, memcg_path, write), + TP_ARGS(mm, write), TP_STRUCT__entry( __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) + __field(u64, memcg_id) __field(bool, write) ), TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path); + __entry->memcg_id = cgroup_id_from_mm(mm); __entry->write = write; ), TP_printk( - "mm=%p memcg_path=%s write=%s", - __entry->mm, - __get_str(memcg_path), + "mm=%p memcg_id=%llu write=%s", + __entry->mm, __entry->memcg_id, __entry->write ? "true" : "false" ) ); #define DEFINE_MMAP_LOCK_EVENT(name) \ DEFINE_EVENT(mmap_lock, name, \ - TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ - bool write), \ - TP_ARGS(mm, memcg_path, write)) + TP_PROTO(struct mm_struct *mm, bool write), \ + TP_ARGS(mm, write)) DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); TRACE_EVENT(mmap_lock_acquire_returned, - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, - bool success), + TP_PROTO(struct mm_struct *mm, bool write, bool success), - TP_ARGS(mm, memcg_path, write, success), + TP_ARGS(mm, write, success), TP_STRUCT__entry( __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) + __field(u64, memcg_id) __field(bool, write) __field(bool, success) ), TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path); + __entry->memcg_id = cgroup_id_from_mm(mm); __entry->write = write; __entry->success = success; ), TP_printk( - "mm=%p memcg_path=%s write=%s success=%s", + "mm=%p memcg_id=%llu write=%s success=%s", __entry->mm, - __get_str(memcg_path), + __entry->memcg_id, __entry->write ? "true" : "false", __entry->success ? 
"true" : "false" ) -- cgit v1.2.3 From 20f3ab257211594c110c43e71c31bd25ba31e851 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 22 Nov 2024 15:36:52 +0800 Subject: mm: pgtable: make ptep_clear() non-atomic In the generic ptep_get_and_clear() implementation, it is just a simple combination of ptep_get() and pte_clear(). But for some architectures (such as x86 and arm64, etc), the hardware will modify the A/D bits of the page table entry, so the ptep_get_and_clear() needs to be overwritten and implemented as an atomic operation to avoid contention, which has a performance cost. The commit d283d422c6c4 ("x86: mm: add x86_64 support for page table check") adds the ptep_clear() on the x86, and makes it call ptep_get_and_clear() when CONFIG_PAGE_TABLE_CHECK is enabled. The page table check feature does not actually care about the A/D bits, so only ptep_get() + pte_clear() should be called. But considering that the page table check is a debug option, this should not have much of an impact. But then the commit de8c8e52836d ("mm: page_table_check: add hooks to public helpers") changed ptep_clear() to unconditionally call ptep_get_and_clear(), so that the CONFIG_PAGE_TABLE_CHECK check can be put into the page table check stubs (in include/linux/page_table_check.h). This also cause performance loss to the kernel without CONFIG_PAGE_TABLE_CHECK enabled, which doesn't make sense. Currently ptep_clear() is only used in debug code and in khugepaged collapse paths, which are fairly expensive. So the cost of an extra atomic RMW operation does not matter. But this may be used for other paths in the future. After all, for the present pte entry, we need to call ptep_clear() instead of pte_clear() to ensure that PAGE_TABLE_CHECK works properly. So to be more precise, just calling ptep_get() and pte_clear() in the ptep_clear(). 
Link: https://lkml.kernel.org/r/20241122073652.54030-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Pasha Tatashin Reviewed-by: Jann Horn Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Lorenzo Stoakes Cc: Peter Xu Cc: Ryan Roberts Cc: Tong Tiangen Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index adef9d6e9b1b..94d267d02372 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -533,7 +533,14 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep_get_and_clear(mm, addr, ptep); + pte_t pte = ptep_get(ptep); + + pte_clear(mm, addr, ptep); + /* + * No need for ptep_get_and_clear(): page table check doesn't care about + * any bits that could have been set by HW concurrently. + */ + page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH -- cgit v1.2.3 From d40797d6720e861196e848f3615bb09dae5be7ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Nov 2024 16:54:51 +0100 Subject: kasan: make kasan_record_aux_stack_noalloc() the default behaviour kasan_record_aux_stack_noalloc() was introduced to record a stack trace without allocating memory in the process. It has been added to callers which were invoked while a raw_spinlock_t was held. More and more callers were identified and changed over time. Is it a good thing to have this while functions try their best to do a locklessly setup? 
The only downside of having kasan_record_aux_stack() not allocate any memory is that we end up without a stacktrace if stackdepot runs out of memory and at the same time the stacktrace was not recorded before. To quote Marco Elver from https://lore.kernel.org/all/CANpmjNPmQYJ7pv1N3cuU8cP18u7PP_uoZD8YxwZd4jtbof9nVQ@mail.gmail.com/ | I'd be in favor, it simplifies things. And stack depot should be | able to replenish its pool sufficiently in the "non-aux" cases | i.e. regular allocations. Worst case we fail to record some | aux stacks, but I think that's only really bad if there's a bug | around one of these allocations. In general the probabilities | of this being a regression are extremely small [...] Make the kasan_record_aux_stack_noalloc() behaviour the default for kasan_record_aux_stack(). [bigeasy@linutronix.de: dressed the diff as patch] Link: https://lkml.kernel.org/r/20241122155451.Mb2pmeyJ@linutronix.de Fixes: 7cb3007ce2da ("kasan: generic: introduce kasan_record_aux_stack_noalloc()") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Reported-by: syzbot+39f85d612b7c20d8db48@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67275485.050a0220.3c8d68.0a37.GAE@google.com Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Waiman Long Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ben Segall Cc: Boqun Feng Cc: Christoph Lameter Cc: David Rientjes Cc: Dietmar Eggemann Cc: Dmitry Vyukov Cc: Frederic Weisbecker Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes (Google) Cc: Joonsoo Kim Cc: Josh Triplett Cc: Juri Lelli Cc: Cc: Lai Jiangshan Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Neeraj Upadhyay Cc: Paul E.
McKenney Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt Cc: syzkaller-bugs@googlegroups.com Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 -- include/linux/task_work.h | 3 --- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6bbfc8aa42e8..1c1b3d39e7b6 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -491,7 +491,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); -void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ @@ -509,7 +508,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} -static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 2964171856e0..0646804860ff 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -19,9 +19,6 @@ enum task_work_notify_mode { TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, - - TWA_FLAGS = 0xff00, - TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) -- cgit v1.2.3 From da243c5479add600bdd58c910c9fae3355b4f026 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 28 Nov 2024 15:40:39 +0800 Subject: mm: factor out the order calculation into a new helper Patch series "Support large folios for tmpfs", v3. Traditionally, tmpfs only supported PMD-sized large folios. 
However, nowadays with other file systems supporting any sized large folios, and anonymous memory being extended to support mTHP, we should not restrict tmpfs to allocating only PMD-sized large folios, making it more special. Instead, we should allow tmpfs to allocate any sized large folios. Considering that tmpfs already has the 'huge=' option to control the PMD-sized large folios allocation, we can extend the 'huge=' option to allow any sized large folios. The semantics of the 'huge=' mount option are: huge=never: no large folios of any size huge=always: any sized large folios huge=within_size: like 'always' but respect the i_size huge=advise: like 'always' if requested with madvise() Note: for tmpfs mmap() faults, due to the lack of a write size hint, still allocate the PMD-sized large folios if huge=always/within_size/advise is set. Moreover, the 'deny' and 'force' testing options controlled by '/sys/kernel/mm/transparent_hugepage/shmem_enabled', still retain the same semantics. The 'deny' can disable any sized large folios for tmpfs, while the 'force' can enable PMD-sized large folios for tmpfs. This patch (of 6): Factor out the order calculation into a new helper, which can be reused by shmem in the following patch.
Link: https://lkml.kernel.org/r/cover.1732779148.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/5505f9ea50942820c1924d1803bfdd3a524e54f6.1732779148.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Suggested-by: Matthew Wilcox Reviewed-by: Barry Song Reviewed-by: David Hildenbrand Reviewed-by: Daniel Gomez Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcf0865a38ae..d796c8a33647 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -727,6 +727,16 @@ typedef unsigned int __bitwise fgf_t; #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) +static inline unsigned int filemap_get_order(size_t size) +{ + unsigned int shift = ilog2(size); + + if (shift <= PAGE_SHIFT) + return 0; + + return shift - PAGE_SHIFT; +} + /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. @@ -740,11 +750,11 @@ typedef unsigned int __bitwise fgf_t; */ static inline fgf_t fgf_set_order(size_t size) { - unsigned int shift = ilog2(size); + unsigned int order = filemap_get_order(size); - if (shift <= PAGE_SHIFT) + if (!order) return 0; - return (__force fgf_t)((shift - PAGE_SHIFT) << 26); + return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -- cgit v1.2.3 From da80f4ffb0dbee2419ac04f23aad0533658f1523 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 29 Nov 2024 14:58:25 +0000 Subject: list_lru: expand list_lru_add() docs with info about sublists The documentation for list_lru_add() and list_lru_del() has not been updated since lru lists were originally introduced by commit a38e40824844 ("list: add a new LRU list type"). 
Back then, list_lru stored all of the items in a single list, but the implementation has since been expanded to use many sublists internally. Thus, update the docs to mention that the requirements about not using the item with several lists at the same time also applies not using different sublists. Also mention that list_lru items are reparented when the memcg is deleted as discussed on the LKML [1]. Also fix incorrect use of 'Return value:' which should be 'Return:'. Link: https://lore.kernel.org/all/Z0eXrllVhRI9Ag5b@dread.disaster.area/ [1] Link: https://lkml.kernel.org/r/20241129-list_lru_memcg_docs-v2-1-e285ff1c481b@google.com Signed-off-by: Alice Ryhl Reviewed-by: Dave Chinner Acked-by: Muchun Song Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 05c166811f6b..fe739d35a864 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -91,13 +91,24 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing - * nothing. Therefore the caller does not need to keep state about whether or - * not the element already belongs in the list and is allowed to lazy update - * it. Note however that this is valid for *a* list, not *this* list. If - * the caller organize itself in a way that elements can be in more than - * one type of list, it is up to the caller to fully remove the item from - * the previous list (with list_lru_del() for instance) before moving it - * to @lru. + * nothing. This means that it is not necessary to keep state about whether or + * not the element already belongs in the list. 
That said, this logic only + * works if the item is in *this* list. If the item might be in some other + * list, then you cannot rely on this check and you must remove it from the + * other list before trying to insert it. + * + * The lru list consists of many sublists internally; the @nid and @memcg + * parameters are used to determine which sublist to insert the item into. + * It's important to use the right value of @nid and @memcg when deleting the + * item, since it might otherwise get deleted from the wrong sublist. + * + * This also applies when attempting to insert the item multiple times - if + * the item is currently in one sublist and you call list_lru_add() again, you + * must pass the right @nid and @memcg parameters so that the same sublist is + * used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). * * Return: true if the list was updated, false otherwise */ @@ -113,7 +124,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); @@ -125,8 +136,19 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list - * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del(). + * manipulation. + * + * The comments in list_lru_add() about an element already being in a list are + * also valid for list_lru_del(), that is, you can delete an item that has + * already been removed or never been added. 
However, if the item is in a + * list, it must be in *this* list, and you must pass the right value of @nid + * and @memcg so that the right sublist is used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). When a memcg is deleted, list_lru entries + * are automatically moved to the parent memcg. This is done in a race-free + * way, so during deletion of an memcg both the old and new memcg will resolve + * to the same sublist internally. * * Return: true if the list was updated, false otherwise */ @@ -142,7 +164,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise. + * Return: true if the list was updated, false otherwise. */ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); -- cgit v1.2.3 From 1168b2bec7660f5146de7b14c14b52417e900d18 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 16 Nov 2024 15:14:46 +0000 Subject: filemap: remove unused folio_add_wait_queue folio_add_wait_queue() has been unused since 2021's commit 850cba069c26 ("cachefiles: Delete the cachefiles driver pending rewrite") Remove it. Link: https://lkml.kernel.org/r/20241116151446.95555-1-linux@treblig.org Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: David Hildenbrand Reviewed-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d796c8a33647..fc2e1319c7bb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1280,11 +1280,6 @@ void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); -/* - * Add an arbitrary waiter to a page's wait queue - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); - /* * Fault in userspace address range. */ -- cgit v1.2.3 From 21641bd9a7a7ce0360106a5a8e5b89a4fc74529d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 4 Nov 2024 11:23:18 -0300 Subject: lazy tlb: fix hotplug exit race with MMU_LAZY_TLB_SHOOTDOWN CPU unplug first calls __cpu_disable(), and that's where powerpc calls cleanup_cpu_mmu_context(), which clears this CPU from mm_cpumask() of all mms in the system. However this CPU may still be using a lazy tlb mm, and its mm_cpumask bit will be cleared from it. The CPU does not switch away from the lazy tlb mm until arch_cpu_idle_dead() calls idle_task_exit(). If that user mm exits in this window, it will not be subject to the lazy tlb mm shootdown and may be freed while in use as a lazy mm by the CPU that is being unplugged. cleanup_cpu_mmu_context() could be moved later, but it looks better to move the lazy tlb mm switching earlier. The problem with doing the lazy mm switching in idle_task_exit() is explained in commit bf2c59fce4074 ("sched/core: Fix illegal RCU from offline CPUs"), which added a wart to switch away from the mm but leave it set in active_mm to be cleaned up later. So instead, switch away from the lazy tlb mm at sched_cpu_wait_empty(), which is the last hotplug state before teardown (CPUHP_AP_SCHED_WAIT_EMPTY). 
This CPU will never switch to a user thread from this point, so it has no chance to pick up a new lazy tlb mm. This removes the lazy tlb mm handling wart in CPU unplug. With this, idle_task_exit() is not needed anymore and can be cleaned up. This leaves the prototype alone, to be cleaned after this change. herton: took the suggestions from https://lore.kernel.org/all/87jzvyprsw.ffs@tglx/ and made adjustments on the initial patch proposed by Nicholas. Link: https://lkml.kernel.org/r/20230524060455.147699-1-npiggin@gmail.com Link: https://lore.kernel.org/all/20230525205253.E2FAEC433EF@smtp.kernel.org/ Link: https://lkml.kernel.org/r/20241104142318.3295663-1-herton@redhat.com Fixes: 2655421ae69f ("lazy tlb: shoot lazies, non-refcounting lazy tlb mm reference handling scheme") Signed-off-by: Nicholas Piggin Signed-off-by: Herton R. Krzesinski Suggested-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/sched/hotplug.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 412cdaba33eb..17e04859b9a4 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -18,10 +18,6 @@ extern int sched_cpu_dying(unsigned int cpu); # define sched_cpu_dying NULL #endif -#ifdef CONFIG_HOTPLUG_CPU -extern void idle_task_exit(void); -#else static inline void idle_task_exit(void) {} -#endif #endif /* _LINUX_SCHED_HOTPLUG_H */ -- cgit v1.2.3 From 7a5714991872f0a4805cc6004a5bff19a71d0459 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Dec 2024 18:05:10 +0000 Subject: mm: abstract get_arg_page() stack expansion and mmap read lock Right now fs/exec.c invokes expand_downwards(), an otherwise internal implementation detail of the VMA logic in order to ensure that an arg page can be obtained by get_user_pages_remote(). 
In order to be able to move the stack expansion logic into mm/vma.c to make it available to userland testing, we need to find an alternative approach here. We do so by providing the mmap_read_lock_maybe_expand() function which also helpfully documents what get_arg_page() is doing here and adds an additional check against VM_GROWSDOWN to make explicit that the stack expansion logic is only invoked when the VMA is indeed a downward-growing stack. This allows expand_downwards() to become a static function. Importantly, the VMA referenced by mmap_read_lock_maybe_expand() must NOT be currently user-visible in any way, that is, placed within an rmap or VMA tree. It must be a newly allocated VMA. This is the case when exec invokes this function. Link: https://lkml.kernel.org/r/5295d1c70c58e6aa63d14be68d4e1de9fa1c8e6d.1733248985.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Al Viro Cc: Christian Brauner Cc: Eric W. Biederman Cc: Jan Kara Cc: Jann Horn Cc: Kees Cook Cc: Liam R. Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index b1c3db9cf355..2e5ef71b8629 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3324,6 +3324,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3437,9 +3439,6 @@ extern unsigned long stack_guard_gap; int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long
addr); -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address); - /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, -- cgit v1.2.3 From b9e40605daa94ae1817ceb5ce9e9b34093c6d850 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 3 Dec 2024 10:47:28 +0100 Subject: mm/page_isolation: don't pass gfp flags to start_isolate_page_range() The parameter is unused, so let's stop passing it. Link: https://lkml.kernel.org/r/20241203094732.200195-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Reviewed-by: Vishal Moola (Oracle) Cc: Christophe Leroy Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 73dc2c1841ec..898bb788243b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype); int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags); + int migratetype, int flags); void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype); -- cgit v1.2.3 From 735fad44b5a86edf0fe65a8e8d43595bd1cf1d58 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:46 +0800 Subject: mm: zap_install_uffd_wp_if_needed: return whether uffd-wp pte has been re-installed In some cases, we'll replace the none pte with an uffd-wp swap special pte 
marker when necessary. Let's expose this information to the caller through the return value, so that subsequent commits can use this information to detect whether the PTE page is empty. Link: https://lkml.kernel.org/r/9d4516554724eda87d6576468042a1741c475413.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1b6a917fffa4..34e5097182a0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -564,9 +564,9 @@ static inline pte_marker copy_pte_marker( * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * - * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. + * Returns true if an uffd-wp pte was installed, false otherwise. */ -static inline void +static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { @@ -583,7 +583,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, * with a swap pte. There's no way of leaking the bit. 
*/ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) - return; + return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) @@ -596,10 +596,13 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; - if (unlikely(arm_uffd_pte)) + if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); + return true; + } #endif + return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) -- cgit v1.2.3 From 6375e95f381e3dc85065b6f74263a61522736203 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:49 +0800 Subject: mm: pgtable: reclaim empty PTE page in madvise(MADV_DONTNEED) Now in order to pursue high performance, applications mostly use some high-performance user-mode memory allocators, such as jemalloc or tcmalloc. These memory allocators use madvise(MADV_DONTNEED or MADV_FREE) to release physical memory, but neither MADV_DONTNEED nor MADV_FREE will release page table memory, which may cause huge page table memory usage. The following are a memory usage snapshot of one process which actually happened on our server: VIRT: 55t RES: 590g VmPTE: 110g In this case, most of the page table entries are empty. For such a PTE page where all entries are empty, we can actually free it back to the system for others to use. As a first step, this commit aims to synchronously free the empty PTE pages in madvise(MADV_DONTNEED) case. We will detect and free empty PTE pages in zap_pte_range(), and will add zap_details.reclaim_pt to exclude cases other than madvise(MADV_DONTNEED). Once an empty PTE is detected, we first try to hold the pmd lock within the pte lock. If successful, we clear the pmd entry directly (fast path). 
Otherwise, we wait until the pte lock is released, then re-hold the pmd and pte locks and loop PTRS_PER_PTE times to check pte_none() to re-detect whether the PTE page is empty and free it (slow path). For other cases such as madvise(MADV_FREE), consider scanning and freeing empty PTE pages asynchronously in the future. The following code snippet can show the effect of optimization: mmap 50G while (1) { for (; i < 1024 * 25; i++) { touch 2M memory madvise MADV_DONTNEED 2M } } As we can see, the memory usage of VmPTE is reduced: before after VIRT 50.0 GB 50.0 GB RES 3.1 MB 3.1 MB VmPTE 102640 KB 240 KB [zhengqi.arch@bytedance.com: fix uninitialized symbol 'ptl'] Link: https://lkml.kernel.org/r/20241206112348.51570-1-zhengqi.arch@bytedance.com Link: https://lore.kernel.org/linux-mm/224e6a4e-43b5-4080-bdd8-b0a6fb2f0853@stanley.mountain/ Link: https://lkml.kernel.org/r/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e5ef71b8629..9372bc058b43 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2320,6 +2320,7 @@ extern void pagefault_out_of_memory(void); struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? 
*/ zap_flags_t zap_flags; /* Extra flags for zapping */ }; -- cgit v1.2.3 From 718b13861d2256ac95d65b892953282a63faf240 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:50 +0800 Subject: x86: mm: free page table pages by RCU instead of semi RCU Now, if CONFIG_MMU_GATHER_RCU_TABLE_FREE is selected, the page table pages will be freed by semi RCU, that is: - batch table freeing: asynchronous free by RCU - single table freeing: IPI + synchronous free In this way, the page table can be locklessly traversed by disabling IRQ in paths such as fast GUP. But this is not enough to free the empty PTE page table pages in paths other than the munmap and exit_mmap paths, because IPI cannot be synchronized with rcu_read_lock() in pte_offset_map{_lock}(). In preparation for supporting empty PTE page table pages reclamation, let single table also be freed by RCU like batch table freeing. Then we can also use pte_offset_map() etc to prevent PTE page from being freed. Like pte_free_defer(), we can also safely use ptdesc->pt_rcu_head to free the page table pages: - The pt_rcu_head is unioned with pt_list and pmd_huge_pte. - For pt_list, it is used to manage the PGD page in x86. Fortunately tlb_remove_table() will not be used to free PGD pages, so it is safe to use pt_rcu_head. - For pmd_huge_pte, it is used for THPs, so it is safe.
After applying this patch, if CONFIG_PT_RECLAIM is enabled, the function call of free_pte() is as follows: free_pte pte_free_tlb __pte_free_tlb ___pte_free_tlb paravirt_tlb_remove_table tlb_remove_table [!CONFIG_PARAVIRT, Xen PV, Hyper-V, KVM] [no-free-memory slowpath:] tlb_table_invalidate tlb_remove_table_one __tlb_remove_table_one [frees via RCU] [fastpath:] tlb_table_flush tlb_remove_table_free [frees via RCU] native_tlb_remove_table [CONFIG_PARAVIRT on native] tlb_remove_table [see above] Link: https://lkml.kernel.org/r/0287d442a973150b0e1019cc406e6322d148277a.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 332cee285662..7490d84af310 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -438,7 +438,9 @@ FOLIO_MATCH(compound_head, _head_2a); * struct ptdesc - Memory descriptor for page tables. * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. - * @pt_list: List of used page tables. Used for s390 and x86. + * @pt_list: List of used page tables. Used for s390 gmap shadow pages + * (which are not linked into the user page tables) and x86 + * pgds. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. 
-- cgit v1.2.3 From 67c8b11bd58aee4644c9a6e495d0c234771e9175 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Mon, 2 Dec 2024 20:47:30 +0800 Subject: mm: add per-order mTHP swap-in fallback/fallback_charge counters Currently, large folio swap-in is supported, but we lack a method to analyze their success ratio. Similar to anon_fault_fallback, we introduce per-order mTHP swpin_fallback and swpin_fallback_charge counters for calculating their success ratio. The new counters are located at: /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/ swpin_fallback swpin_fallback_charge Link: https://lkml.kernel.org/r/20241202124730.2407037-1-haowenchao22@gmail.com Signed-off-by: Wenchao Hao Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Baolin Wang Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Peter Xu Cc: Ryan Roberts Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b94c2e8ee918..93e509b6c00e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -121,6 +121,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPIN, + MTHP_STAT_SWPIN_FALLBACK, + MTHP_STAT_SWPIN_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, -- cgit v1.2.3 From dba4761a3e40433a8d9e434d515ecbae19b3dcb1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:14 -0800 Subject: seqlock: add raw_seqcount_try_begin Add raw_seqcount_try_begin() to open a read critical section of the given seqcount_t if the counter is even. This enables eliding the critical section entirely if the counter is odd, instead of doing the speculation knowing it will fail. Link: https://lkml.kernel.org/r/20241122174416.1367052-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Reviewed-by: Liam R.
Howlett Suggested-by: Peter Zijlstra Cc: Christian Brauner Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/seqlock.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. 
+ */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization -- cgit v1.2.3 From e5e7fb278e5924f29ceab42bbbb891cde528f7cc Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:15 -0800 Subject: mm: convert mm_lock_seq to a proper seqcount Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock variants to increment it, in line with the usual seqcount usage pattern. This lets us check whether the mmap_lock is write-locked by checking mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be used when implementing mmap_lock speculation functions. As a result vm_lock_seq is also changed to be unsigned to match the type of mm_lock_seq.sequence. Link: https://lkml.kernel.org/r/20241122174416.1367052-2-surenb@google.com Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E.
McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++++------ include/linux/mm_types.h | 7 ++++-- include/linux/mmap_lock.h | 55 +++++++++++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9372bc058b43..a3a50c37603e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. 
*/ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7490d84af310..5f1b2dc788e2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -729,7 +729,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -923,6 +923,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -931,7 +934,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. 
*/ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..9715326f5a85 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,39 +71,39 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK -/* - * Drop all currently-held per-VMA locks. - * This is called from the mmap_lock implementation directly before releasing - * a write-locked mmap_lock (or downgrading it to read-locked). - * This should normally NOT be called manually from other places. - * If you want to call this manually anyway, keep in mind that this will release - * *all* VMA write locks, including ones from further up the stack. - */ -static inline void vma_end_write_all(struct mm_struct *mm) +static inline void mm_lock_seqcount_init(struct mm_struct *mm) { - mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). 
- */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); } + #else -static inline void vma_end_write_all(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} #endif static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -111,6 +111,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -120,10 +121,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. 
+ */ +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + mm_lock_seqcount_end(mm); +} + static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); -- cgit v1.2.3 From 6f030e32e4499942a223677169d006085d8c57ce Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:16 -0800 Subject: mm: introduce mmap_lock_speculate_{try_begin|retry} Add helper functions to speculatively perform operations without read-locking mmap_lock, expecting that mmap_lock will not be write-locked and mm is not modified from under us. [akpm@linux-foundation.org: use read_seqcount_retry() in mmap_lock_speculate_retry(), per Wei Yang] Link: https://lkml.kernel.org/r/20241122174416.1367052-3-surenb@google.com Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. 
McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 9715326f5a85..45a21faa3ff6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -87,11 +88,39 @@ static inline void mm_lock_seqcount_end(struct mm_struct *mm) do_raw_write_seqcount_end(&mm->mm_lock_seq); } -#else +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. 
+ */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} -#endif + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { -- cgit v1.2.3 From fa00b8ef1803fe133b4897c25227aa0d298dd093 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 21:28:46 +0000 Subject: mm: perform all memfd seal checks in a single place We no longer actually need to perform these checks in the f_op->mmap() hook. We already moved the operation which clears VM_MAYWRITE on a read-only mapping of a write-sealed memfd in order to work around the restrictions imposed by commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour"). There is no reason for us not to simply go ahead and additionally check to see if any pre-existing seals are in place here rather than defer this to the f_op->mmap() hook. By doing this we remove more logic from shmem_mmap() which doesn't belong there, as well as doing the same for hugetlbfs_file_mmap(). We also remove dubious shared logic in mm.h which simply does not belong there either. It makes sense to do these checks at the earliest opportunity; we know these are shmem (or hugetlbfs) mappings whose relevant VMA flags will not change from the invoking do_mmap() so there is simply no need to wait.
This also means the implementation of further memfd seal flags can be done within mm/memfd.c and also have the opportunity to modify VMA flags as necessary early in the mapping logic. [lorenzo.stoakes@oracle.com: fix typos in !memfd inline stub] Link: https://lkml.kernel.org/r/7dee6c5d-480b-4c24-b98e-6fa47dbd8a23@lucifer.local Link: https://lkml.kernel.org/r/20241206212846.210835-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Isaac J. Manjarres Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Muchun Song Cc: Vlastimil Babka Cc: Jeff Xu Signed-off-by: Andrew Morton --- include/linux/memfd.h | 23 +++++++++++---------- include/linux/mm.h | 55 --------------------------------------------------- 2 files changed, 11 insertions(+), 67 deletions(-) (limited to 'include') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index d437e3070850..246daadbfde8 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,7 +7,14 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); -unsigned int *memfd_file_seals_ptr(struct file *file); +/* + * Check for any existing seals on mmap, return an error if access is denied due + * to sealing, or 0 otherwise. + * + * We also update VMA flags if appropriate by manipulating the VMA flags pointed + * to by vm_flags_ptr. 
+ */ +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -17,19 +24,11 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } - -static inline unsigned int *memfd_file_seals_ptr(struct file *file) +static inline int memfd_check_seals_mmap(struct file *file, + unsigned long *vm_flags_ptr) { - return NULL; + return 0; } #endif -/* Retrieve memfd seals associated with the file, if any. */ -static inline unsigned int memfd_file_seals(struct file *file) -{ - unsigned int *sealsp = memfd_file_seals_ptr(file); - - return sealsp ? *sealsp : 0; -} - #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index a3a50c37603e..e7c54b9aac6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4102,61 +4102,6 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif -static inline bool is_write_sealed(int seals) -{ - return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); -} - -/** - * is_readonly_sealed - Checks whether write-sealed but mapped read-only, - * in which case writes should be disallowing moving - * forwards. - * @seals: the seals to check - * @vm_flags: the VMA flags to check - * - * Returns whether readonly sealed, in which case writess should be disallowed - * going forward. - */ -static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) -{ - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. 
- */ - if (is_write_sealed(seals) && - ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) - return true; - - return false; -} - -/** - * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and - * handle them. - * @seals: the seals to check - * @vma: the vma to operate on - * - * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper - * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. - */ -static inline int seal_check_write(int seals, struct vm_area_struct *vma) -{ - if (!is_write_sealed(seals)) - return 0; - - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - return 0; -} - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, -- cgit v1.2.3 From 991135774c0e05a4734e6d32aa03b00355e4cac9 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Wed, 11 Dec 2024 12:39:50 -0800 Subject: memcg/hugetlb: introduce mem_cgroup_charge_hugetlb This patch introduces mem_cgroup_charge_hugetlb which combines the logic of mem_cgroup_hugetlb_try_charge / mem_cgroup_hugetlb_commit_charge and removes the need for mem_cgroup_hugetlb_cancel_charge. It also reduces the footprint of memcg in hugetlb code and consolidates all memcg related error paths into one. 
Link: https://lkml.kernel.org/r/20241211203951.764733-3-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Shakeel Butt Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b28180269e75..387470bed399 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -649,6 +649,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); +int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); @@ -1169,6 +1171,11 @@ static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, return 0; } +static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { -- cgit v1.2.3 From 1d8f136a421f26747e58c01281cba5bffae8d289 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Wed, 11 Dec 2024 12:39:51 -0800 Subject: memcg/hugetlb: remove memcg hugetlb try-commit-cancel protocol This patch fully removes the mem_cgroup_{try, commit, cancel}_charge functions, as well as their hugetlb variants. 
Link: https://lkml.kernel.org/r/20241211203951.764733-4-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Shakeel Butt Reviewed-by: Nhat Pham Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 387470bed399..6e74b8254d9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -620,8 +620,6 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, page_counter_read(&memcg->memory); } -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); - int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** @@ -646,9 +644,6 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, - long nr_pages); - int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, @@ -679,7 +674,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) __mem_cgroup_uncharge_folios(folios); } -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); @@ -1154,23 +1148,12 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, return false; } -static inline void mem_cgroup_commit_charge(struct folio *folio, - struct mem_cgroup *memcg) -{ -} - static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } -static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, - gfp_t gfp, long nr_pages) -{ - return 0; -} - static 
inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) { return 0; @@ -1194,11 +1177,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } -static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ -} - static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { -- cgit v1.2.3 From ccd582059a132f2bdc3486766ac57c24c465f471 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Thu, 12 Dec 2024 18:10:00 +0800 Subject: mm/early_ioremap: add null pointer checks to prevent NULL-pointer dereference The early_ioremap interface can fail and return NULL in certain cases. To prevent NULL-pointer dereference crashes, fixed issues in the acpi_extlog and copy_early_mem interfaces, improving robustness when handling early memory. Link: https://lkml.kernel.org/r/20241212101004.1544070-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Julian Stecklina Cc: Kevin Loughlin Cc: Len Brown Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Xin Li (Intel) Signed-off-by: Andrew Morton --- include/asm-generic/early_ioremap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 9d0479f50f97..5db59a1efb65 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -35,7 +35,7 @@ extern void early_ioremap_reset(void); /* * Early copy from unmapped memory to kernel mapped memory. 
*/ -extern void copy_from_early_mem(void *dest, phys_addr_t src, +extern int copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size); #else -- cgit v1.2.3 From 40733e7e0c260d540447d3646e451274bc5d3374 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Dec 2024 19:46:31 +0800 Subject: mm/swap_cgroup: remove swap_cgroup_cmpxchg This function is never used after commit 6b611388b626 ("memcg-v1: remove charge move code"). Link: https://lkml.kernel.org/r/20241218114633.85196-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Yosry Ahmed Reviewed-by: Roman Gushchin Acked-by: Shakeel Butt Acked-by: Chris Li Cc: Barry Song Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index ae73a87775b3..d521ad1c4164 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,8 +6,6 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); -- cgit v1.2.3 From 6769183166b33b1a5de8f938d1ff4d5f4be0f428 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Dec 2024 19:46:33 +0800 Subject: mm/swap_cgroup: decouple swap cgroup recording and clearing The current implementation of swap cgroup tracking is a bit complex and fragile: On charging path, swap_cgroup_record always records an actual memcg id, and it depends on the caller to make sure all entries passed in must belong to one single folio. As folios are always charged or uncharged as a whole, and always charged and uncharged in order, swap_cgroup doesn't need an extra lock. 
On uncharging path, swap_cgroup_record always sets the record to zero. These entries won't be charged again until uncharging is done. So there is no extra lock needed either. Worth noting that swap cgroup clearing may happen without folio involved, e.g. an exiting process will zap its page table without swapin. The xchg/cmpxchg provides atomic operations and barriers to ensure no tearing or synchronization issue of these swap cgroup records. It works but is quite error-prone. Things can be much clearer and more robust by decoupling recording and clearing into two helpers. Recording takes the actual folio being charged as argument, and clearing always sets the record to zero; the debug sanity checks are also refined to better reflect their usage. Benchmark even showed a very slight improvement as it saved some extra arguments and lookups: make -j96 with defconfig on tmpfs in 1.5G memory cgroup using 4k folios: Before: sys 9617.23 (stdev 37.764062) After : sys 9541.54 (stdev 42.973976) make -j96 with defconfig on tmpfs in 2G memory cgroup using 64k folios: Before: sys 7358.98 (stdev 54.927593) After : sys 7337.82 (stdev 39.398956) Link: https://lkml.kernel.org/r/20241218114633.85196-5-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Barry Song Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index d521ad1c4164..b5ec038069da 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,8 +6,8 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents); +extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent); +extern unsigned short swap_cgroup_clear(swp_entry_t ent,
unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); @@ -15,8 +15,12 @@ extern void swap_cgroup_swapoff(int type); #else static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +{ +} + +static inline +unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { return 0; } -- cgit v1.2.3 From 04f13d241b8b146b23038bffd907cb8278391d07 Mon Sep 17 00:00:00 2001 From: yangge Date: Sat, 11 Jan 2025 15:58:20 +0800 Subject: mm: replace free hugepage folios after migration My machine has 4 NUMA nodes, each equipped with 32GB of memory. I have configured each NUMA node with 16GB of CMA and 16GB of in-use hugetlb pages. The allocation of contiguous memory via cma_alloc() can fail probabilistically. When there are free hugetlb folios in the hugetlb pool, during the migration of in-use hugetlb folios, new folios are allocated from the free hugetlb pool. After the migration is completed, the old folios are released back to the free hugetlb pool instead of being returned to the buddy system. This can cause test_pages_isolated() check to fail, ultimately leading to the failure of cma_alloc(). Call trace: cma_alloc() __alloc_contig_migrate_range() // migrate in-use hugepage test_pages_isolated() __test_page_isolated_in_pageblock() PageBuddy(page) // check if the page is in buddy To address this issue, we introduce a function named replace_free_hugepage_folios(). This function will replace the hugepage in the free hugepage pool with a new one and release the old one to the buddy system. After the migration of in-use hugetlb pages is completed, we will invoke replace_free_hugepage_folios() to ensure that these hugepages are properly released to the buddy system. 
Following this step, when test_pages_isolated() is executed for inspection, it will successfully pass. Additionally, when alloc_contig_range() is used to migrate multiple in-use hugetlb pages, it can result in some in-use hugetlb pages being released back to the free hugetlb pool and subsequently being reallocated and used again. For example: [huge 0] [huge 1] To migrate huge 0, we obtain huge x from the pool. After the migration is completed, we return the now-freed huge 0 back to the pool. When it's time to migrate huge 1, we can simply reuse the now-freed huge 0 from the pool. As a result, when replace_free_hugepage_folios() is executed, it cannot release huge 0 back to the buddy system. To address this issue, we should prevent the reuse of isolated free hugepages during the migration process. Link: https://lkml.kernel.org/r/1734503588-16254-1-git-send-email-yangge1116@126.com Link: https://lkml.kernel.org/r/1736582300-11364-1-git-send-email-yangge1116@126.com Signed-off-by: yangge Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ae4fe8615bb6..10faf42ca96a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -681,6 +681,7 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, @@ -1059,6 +1060,12 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } +static inline int replace_free_hugepage_folios(unsigned long start_pfn, + unsigned long end_pfn) +{ + return 0; +} + 
static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -- cgit v1.2.3 From 44d46b76c3a4b514a0cc9dab147ed430e5c1d699 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 20 Dec 2024 16:07:09 -0500 Subject: mm: add build-time option for hotplug memory default online type Memory hotplug presently auto-onlines memory into a zone the kernel deems appropriate if CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y. The memhp_default_state boot param enables runtime config, but it's not possible to do this at build-time. Remove CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE, and replace it with CONFIG_MHP_DEFAULT_ONLINE_TYPE_* choices that sync with the boot param. Selections: CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE => mhp_default_online_type = "offline" Memory will not be onlined automatically. CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO => mhp_default_online_type = "online" Memory will be onlined automatically in a zone deemed appropriate by the kernel. CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL => mhp_default_online_type = "online_kernel" Memory will be onlined automatically. The zone may allow kernel data (e.g. ZONE_NORMAL). CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE => mhp_default_online_type = "online_movable" Memory will be onlined automatically. The zone will be ZONE_MOVABLE. Default to CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE to match the existing default CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=n behavior. Existing users of CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y should use CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO. [gourry@gourry.net: update KConfig comments] Link: https://lkml.kernel.org/r/20241226182918.648799-1-gourry@gourry.net Link: https://lkml.kernel.org/r/20241220210709.300066-1-gourry@gourry.net Signed-off-by: Gregory Price Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huacai Chen Cc: Jonathan Corbet Cc: Oscar Salvador Cc: "Rafael J.
Wysocki" Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index b27ddce5d324..eaac5ae8c05c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -144,8 +144,6 @@ extern u64 max_mem_size; extern int mhp_online_type_from_str(const char *str); -/* Default online_type (MMOP_*) when new memory blocks are added. */ -extern int mhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) @@ -303,6 +301,9 @@ static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_MEMORY_HOTPLUG +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int mhp_get_default_online_type(void); +extern void mhp_set_default_online_type(int online_type); extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); -- cgit v1.2.3 From 98a7e47faa3ec38260b851a1c5823cbd45d5a229 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Wed, 8 Jan 2025 14:57:19 +0800 Subject: asm-generic: pgalloc: provide generic p4d_{alloc_one,free} Four architectures currently implement 5-level pgtables: arm64, riscv, x86 and s390. The first three have essentially the same implementation for p4d_alloc_one() and p4d_free(), so we've got an opportunity to reduce duplication like at the lower levels. Provide a generic version of p4d_alloc_one() and p4d_free(), and make use of it on those architectures. Their implementation is the same as at PUD level, except that p4d_free() performs a runtime check by calling mm_p4d_folded(). 
5-level pgtables depend on a runtime-detected hardware feature on all supported architectures, so we might as well include this check in the generic implementation. No runtime check is required in p4d_alloc_one() as the top-level p4d_alloc() already does the required check. Link: https://lkml.kernel.org/r/26d69c74a29183ecc335b9b407040d8e4cd70c6a.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Kevin Brodsky Signed-off-by: Qi Zheng Acked-by: Dave Hansen Acked-by: Arnd Bergmann [asm-generic] Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Peter Zijlstra (Intel) Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 7c48f5fbf8aa..59131629ac9c 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -215,6 +215,51 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#if CONFIG_PGTABLE_LEVELS > 4 + +static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) + return NULL; + + return ptdesc_address(ptdesc); +} +#define __p4d_alloc_one(...) 
alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) + +#ifndef __HAVE_ARCH_P4D_ALLOC_ONE +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + return __p4d_alloc_one_noprof(mm, addr); +} +#define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) +#endif + +static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(p4d); + + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + pagetable_free(ptdesc); +} + +#ifndef __HAVE_ARCH_P4D_FREE +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + if (!mm_p4d_folded(mm)) + __p4d_free(mm, p4d); +} +#endif + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ + #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { -- cgit v1.2.3 From 5fcf5fa61218176acf198d9e63fb5739dd147244 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:20 +0800 Subject: mm: pgtable: add statistics for P4D level page table Like other levels of page tables, add statistics for P4D level page table. 
Link: https://lkml.kernel.org/r/d55fe3c286305aae84457da9e1066df99b3de125.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Originally-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 2 ++ include/linux/mm.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 59131629ac9c..bb482eeca0c3 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -230,6 +230,7 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long if (!ptdesc) return NULL; + pagetable_p4d_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) 
alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) @@ -247,6 +248,7 @@ static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + pagetable_p4d_dtor(ptdesc); pagetable_free(ptdesc); } diff --git a/include/linux/mm.h b/include/linux/mm.h index e7c54b9aac6d..2e56a9634a97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3175,6 +3175,22 @@ static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } +static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); +} + +static inline void pagetable_p4d_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From db6b435d731a8d82c38e558175db55466cb5832a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:23 +0800 Subject: mm: pgtable: introduce pagetable_dtor() The pagetable_p*_dtor() are exactly the same except for the handling of ptlock. If we make ptlock_free() handle the case where ptdesc->ptl is NULL and remove VM_BUG_ON_PAGE() from pmd_ptlock_free(), we can unify pagetable_p*_dtor() into one function. Let's introduce pagetable_dtor() to do this. Later, pagetable_dtor() will be moved to tlb_remove_ptdesc(), so that ptlock and page table pages can be freed together (regardless of whether RCU is used). This prevents the use-after-free problem where the ptlock is freed immediately but the page table pages is freed later via RCU. 
Link: https://lkml.kernel.org/r/47f44fff9dc68d9d9e9a0d6c036df275f820598a.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Originally-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Acked-by: Alexander Gordeev [s390] Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 8 +++---- include/linux/mm.h | 52 ++++++++----------------------------------- 2 files changed, 13 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index bb482eeca0c3..4afb346eae25 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -109,7 +109,7 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } @@ -153,7 +153,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } #endif @@ -202,7 +202,7 @@ static inline void __pud_free(struct mm_struct *mm, pud_t *pud) struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - pagetable_pud_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } @@ -248,7 +248,7 @@ static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - 
pagetable_p4d_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e56a9634a97..a3b2263f1c1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2992,6 +2992,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline void pagetable_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + ptlock_free(ptdesc); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3003,15 +3012,6 @@ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) return true; } -static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) @@ -3088,14 +3088,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) return ptlock_init(ptdesc); } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); -#endif - ptlock_free(ptdesc); -} - #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else @@ -3106,7 +3098,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3131,15 +3122,6 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) return true; } -static inline void 
pagetable_pmd_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - pmd_ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be @@ -3167,14 +3149,6 @@ static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) lruvec_stat_add_folio(folio, NR_PAGETABLE); } -static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3183,14 +3157,6 @@ static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) lruvec_stat_add_folio(folio, NR_PAGETABLE); } -static inline void pagetable_p4d_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From 2dccdf7076f671764d02c850e83a8b457105268d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:30 +0800 Subject: mm: pgtable: introduce generic __tlb_remove_table() Several architectures (arm, arm64, riscv and x86) define exactly the same __tlb_remove_table(), just introduce generic __tlb_remove_table() to eliminate these duplications. The s390 __tlb_remove_table() is nearly the same, so also make s390 __tlb_remove_table() version generic. 
Link: https://lkml.kernel.org/r/ea372633d94f4d3f9f56a7ec5994bf050bf77e39.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Kevin Brodsky Acked-by: Andreas Larsson [sparc] Acked-by: Alexander Gordeev [s390] Acked-by: Arnd Bergmann [asm-generic] Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V (Arm) Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Peter Zijlstra (Intel) Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 709830274b75..69de47c7ef3c 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -153,8 +153,9 @@ * * Useful if your architecture has non-page page directories. * - * When used, an architecture is expected to provide __tlb_remove_table() - * which does the actual freeing of these pages. + * When used, an architecture is expected to provide __tlb_remove_table() or + * use the generic __tlb_remove_table(), which does the actual freeing of these + * pages. 
* * MMU_GATHER_RCU_TABLE_FREE * @@ -207,6 +208,16 @@ struct mmu_table_batch { #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) +#ifndef __HAVE_ARCH_TLB_REMOVE_TABLE +static inline void __tlb_remove_table(void *table) +{ + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} +#endif + extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ -- cgit v1.2.3 From 92ec7fd136a1f900656bbf6788e15127529e5387 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:31 +0800 Subject: mm: pgtable: completely move pagetable_dtor() to generic tlb_remove_table() For the generic tlb_remove_table(), it is implemented in the following two forms: 1) CONFIG_MMU_GATHER_TABLE_FREE is enabled tlb_remove_table --> generic __tlb_remove_table() 2) CONFIG_MMU_GATHER_TABLE_FREE is disabled tlb_remove_table --> tlb_remove_page For case 1), the pagetable_dtor() has already been moved to generic __tlb_remove_table(). For case 2), now only arm will call tlb_remove_table()/tlb_remove_ptdesc() when CONFIG_MMU_GATHER_TABLE_FREE is disabled. Let's move pagetable_dtor() completely to generic tlb_remove_table(), so that the architectures can follow more easily. 
Link: https://lkml.kernel.org/r/0c733ac867b287ec08190676496d1decebf49da2.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Kevin Brodsky Reviewed-by: Kevin Brodsky Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Peter Zijlstra (Intel) Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 69de47c7ef3c..53ae7748f555 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -220,14 +220,20 @@ static inline void __tlb_remove_table(void *table) extern void tlb_remove_table(struct mmu_gather *tlb, void *table); -#else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ +#else /* !CONFIG_MMU_GATHER_TABLE_FREE */ +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. 
*/ -#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page)) +static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct page *page = (struct page *)table; + pagetable_dtor(page_ptdesc(page)); + tlb_remove_page(tlb, page); +} #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE -- cgit v1.2.3 From 553e77529fb61e5520b839a0ce412a46cba996e0 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:33 +0800 Subject: mm: pgtable: introduce generic pagetable_dtor_free() The pte_free(), pmd_free(), __pud_free() and __p4d_free() in asm-generic/pgalloc.h and the generic __tlb_remove_table() are basically the same, so let's introduce pagetable_dtor_free() to deduplicate them. In addition, the pagetable_dtor_free() in s390 does the same thing, so let's s390 also calls generic pagetable_dtor_free(). Link: https://lkml.kernel.org/r/1663a0565aca881d1338ceb7d1db4aa9c333abd6.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Acked-by: Alexander Gordeev [s390] Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 12 ++++-------- include/asm-generic/tlb.h | 3 +-- include/linux/mm.h | 6 ++++++ 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 4afb346eae25..e3977ddca15e 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -109,8 +109,7 @@ static inline void pte_free(struct mm_struct 
*mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } @@ -153,8 +152,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #endif @@ -202,8 +200,7 @@ static inline void __pud_free(struct mm_struct *mm, pud_t *pud) struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE @@ -248,8 +245,7 @@ static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_P4D_FREE diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 53ae7748f555..e402aef79c93 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -213,8 +213,7 @@ static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index a3b2263f1c1a..15a903d59d09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3001,6 +3001,12 @@ static inline void pagetable_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } +static inline void pagetable_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); -- cgit v1.2.3 From 30cee1e4861b59200aa09c94a4d789c461e5f408 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Mon, 
30 Dec 2024 15:40:43 +0530 Subject: lib/list_debug.c: add object information in case of invalid object As of now during link list corruption it prints the culprit address and its wrong value, but sometimes it is not enough to catch the actual issue point. If it prints allocation and free path of that corrupted node, it will be a lot easier to find and fix the issues. Adding the same information when data mismatch is found in link list debug data: [ 14.243055] slab kmalloc-32 start ffff0000cda19320 data offset 32 pointer offset 8 size 32 allocated at add_to_list+0x28/0xb0 [ 14.245259] __kmalloc_cache_noprof+0x1c4/0x358 [ 14.245572] add_to_list+0x28/0xb0 ... [ 14.248632] do_el0_svc_compat+0x1c/0x34 [ 14.249018] el0_svc_compat+0x2c/0x80 [ 14.249244] Free path: [ 14.249410] kfree+0x24c/0x2f0 [ 14.249724] do_force_corruption+0xbc/0x100 ... [ 14.252266] el0_svc_common.constprop.0+0x40/0xe0 [ 14.252540] do_el0_svc_compat+0x1c/0x34 [ 14.252763] el0_svc_compat+0x2c/0x80 [ 14.253071] ------------[ cut here ]------------ [ 14.253303] list_del corruption. next->prev should be ffff0000cda192a8, but was 6b6b6b6b6b6b6b6b. (next=ffff0000cda19348) [ 14.254255] WARNING: CPU: 3 PID: 84 at lib/list_debug.c:65 __list_del_entry_valid_or_report+0x158/0x164 Moved prototype of mem_dump_obj() to bug.h, as mm.h can not be included in bug.h. 
Link: https://lkml.kernel.org/r/20241230101043.53773-1-maninder1.s@samsung.com Signed-off-by: Maninder Singh Acked-by: Jan Kara Cc: Al Viro Cc: Christian Brauner Cc: Marco Elver Cc: Rohit Thapliyal Signed-off-by: Andrew Morton --- include/linux/bug.h | 10 +++++++++- include/linux/mm.h | 6 ------ 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index 348acf2558f3..a9948a9f1093 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -73,15 +73,23 @@ static inline void generic_bug_clear_once(void) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef CONFIG_PRINTK +void mem_dump_obj(void *object); +#else +static inline void mem_dump_obj(void *object) {} +#endif + /* * Since detected data corruption should stop operation on the affected * structures. Return value must be checked and sanely acted on by caller. */ static inline __must_check bool check_data_corruption(bool v) { return v; } -#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ +#define CHECK_DATA_CORRUPTION(condition, addr, fmt, ...) 
\ check_data_corruption(({ \ bool corruption = unlikely(condition); \ if (corruption) { \ + if (addr) \ + mem_dump_obj(addr); \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 15a903d59d09..c550912a5d6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4084,12 +4084,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; -#ifdef CONFIG_PRINTK -void mem_dump_obj(void *object); -#else -static inline void mem_dump_obj(void *object) {} -#endif - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, -- cgit v1.2.3 From b0d66d82fce60161de6f3d57f87016c3a6f7a121 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 3 Jan 2025 19:35:35 +0000 Subject: mm/debug: introduce VM_WARN_ON_VMG() to dump VMA merge state Patch series "mm/debug: introduce and use VM_WARN_ON_VMG()". We use a number of asserts, enabled only when CONFIG_DEBUG_VM is set, during VMA merge operations to ensure state is as expected. However, when syzkaller or the like encounters these asserts, often the information provided by the report is insufficient to narrow down what the problem is. We noticed this recently in [0], where a non-repro issue resisted debugging due to simply not having sufficient information to go on. This series improves the situation by providing VM_WARN_ON_VMG() which acts like VM_WARN_ON() (i.e. only actually being invoked if CONFIG_DEBUG_VM is set), while dumping significant information about the VMA merge state, the mm_struct describing the virtual address space, all associated VMAs and, if CONFIG_DEBUG_VM_MAPLE_TREE is set, the associated maple tree. 
[0]:https://lore.kernel.org/all/6774c98f.050a0220.25abdd.0991.GAE@google.com/ This patch (of 2): We use a number of asserts, enabled only when CONFIG_DEBUG_VM is set, during VMA merge operations to ensure state is as expected. However, when syzkaller or the like encounters these asserts, often the information provided by the report is insufficient to narrow down what the problem is. This might not be so much of an issue if the reported problem is reproducible, but if it is a rarely encountered race or some other case which precludes a repro, it is a very big problem (see [0] for the motivating case). It is therefore sensible to provide a means by which we can easily and conveniently dump a lot more information in these circumstances. The aggregation of merge state into a single struct threaded through the operation makes this trivial - we can simply introduce a variant on VM_WARN_ON() which takes the VMA merge state object (vmg) and use that to dump information. This patch therefore introduces VM_WARN_ON_VMG() which provides this functionality. It additionally dumps full mm state, VMA state for each of the three VMAs the vmg contains (prev, next, vma) and if CONFIG_DEBUG_VM_MAPLE_TREE is enabled, dumps the maple tree from the provided VMA iterator if non-NULL. This patch has no functional impact if CONFIG_DEBUG_VM is not set. [0]:https://lore.kernel.org/all/6774c98f.050a0220.25abdd.0991.GAE@google.com/ Link: https://lkml.kernel.org/r/cover.1735932169.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/13b09b52d4d103ee86acaf0ae612539648ae29e0.1735932169.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Jann Horn Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index d7cb1e5ecbda..a0a3894900ed 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -9,10 +9,12 @@ struct page; struct vm_area_struct; struct mm_struct; struct vma_iterator; +struct vma_merge_struct; void dump_page(const struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); +void dump_vmg(const struct vma_merge_struct *vmg, const char *reason); void vma_iter_dump_tree(const struct vma_iterator *vmi); #ifdef CONFIG_DEBUG_VM @@ -87,6 +89,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_VMG(cond, vmg) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_vmg(vmg, "VM_WARN_ON_VMG(" __stringify(cond)")"); \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -104,9 +115,10 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_VMG(cond, vmg) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) -#endif +#endif /* CONFIG_DEBUG_VM */ #ifdef CONFIG_DEBUG_VM_IRQSOFF #define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) -- cgit v1.2.3 From 11e2400b21a3e2dfbc95e31a9a849a30191f7a92 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 3 Jan 2025 18:44:10 +0000 Subject: mm: move common part of pagetable_*_ctor to helper Patch series "Account page tables at all levels". This series should be considered in conjunction with Qi's series [1]. Together, they ensure that page table ctor/dtor are called at all levels (PTE to PGD) and all architectures, where page tables are regular pages. Besides the improvement in accounting and general cleanup, this also creates a single place where construction/destruction hooks can be called for all page tables, namely the now-generic pagetable_dtor() introduced by Qi, and __pagetable_ctor() introduced in this series. [1] https://lore.kernel.org/linux-mm/cover.1735549103.git.zhengqi.arch@bytedance.com/ This patch (of 6): pagetable_*_ctor all have the same basic implementation. Move the common part to a helper to reduce duplication. 
Link: https://lkml.kernel.org/r/20250103184415.2744423-1-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20250103184415.2744423-2-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Acked-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Linus Walleij Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Will Deacon Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c550912a5d6d..2949b58fd633 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2992,6 +2992,14 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline void __pagetable_ctor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); +} + static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3009,12 +3017,9 @@ static inline void pagetable_dtor_free(struct ptdesc *ptdesc) static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } @@ -3118,13 +3123,10 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!pmd_ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); ptdesc_pmd_pts_init(ptdesc); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } @@ -3149,18 +3151,12 
@@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); -- cgit v1.2.3 From a9b3c355c2e6388b0a3b67627460a516d88bdbc9 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 3 Jan 2025 18:44:14 +0000 Subject: asm-generic: pgalloc: provide generic __pgd_{alloc,free} We already have a generic implementation of alloc/free up to P4D level, as well as pgd_free(). Let's finish the work and add a generic PGD-level alloc helper as well. Unlike at lower levels, almost all architectures need some specific magic at PGD level (typically initialising PGD entries), so introducing a generic pgd_alloc() isn't worth it. Instead we introduce two new helpers, __pgd_alloc() and __pgd_free(), and make use of them in the arch-specific pgd_alloc() and pgd_free() wherever possible. To accommodate as many arch as possible, __pgd_alloc() takes a page allocation order. Because pagetable_alloc() allocates zeroed pages, explicit zeroing in pgd_alloc() becomes redundant and we can get rid of it. Some trivial implementations of pgd_free() also become unnecessary once __pgd_alloc() is used; remove them. Another small improvement is consistent accounting of PGD pages by using GFP_PGTABLE_{USER,KERNEL} as appropriate. Not all PGD allocations can be handled by the generic helpers. In particular, multiple architectures allocate PGDs from a kmem_cache, and those PGDs may not be page-sized. 
Link: https://lkml.kernel.org/r/20250103184415.2744423-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Acked-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Ingo Molnar Cc: Linus Walleij Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index e3977ddca15e..de4df14158e6 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -258,10 +258,35 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) #endif /* CONFIG_PGTABLE_LEVELS > 4 */ +static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, order); + if (!ptdesc) + return NULL; + + return ptdesc_address(ptdesc); +} +#define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) + +static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); + + BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + pagetable_free(ptdesc); +} + #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pagetable_free(virt_to_ptdesc(pgd)); + __pgd_free(mm, pgd); } #endif -- cgit v1.2.3 From d95936a2267c11a38917d5fc7bf3862a64fe13d8 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 3 Jan 2025 18:44:15 +0000 Subject: mm: introduce ctor/dtor at PGD level Following on from the introduction of P4D-level ctor/dtor, let's finish the job and introduce ctor/dtor at PGD level. 
The incurred improvement in page accounting is minimal - the main motivation is to create a single, generic place where construction/destruction hooks can be added for all page table pages. This patch should cover all architectures and all configurations where PGDs are one or more regular pages. This excludes any configuration where PGDs are allocated from a kmem_cache object. Link: https://lkml.kernel.org/r/20250103184415.2744423-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Acked-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Ingo Molnar Cc: Linus Walleij Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 3 ++- include/linux/mm.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index de4df14158e6..892ece4558a2 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -271,6 +271,7 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order if (!ptdesc) return NULL; + pagetable_pgd_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __pgd_alloc(...) 
alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) @@ -280,7 +281,7 @@ static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) struct ptdesc *ptdesc = virt_to_ptdesc(pgd); BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PGD_FREE diff --git a/include/linux/mm.h b/include/linux/mm.h index 2949b58fd633..3550cbeed488 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3159,6 +3159,11 @@ static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) __pagetable_ctor(ptdesc); } +static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From 42b7491af14cbba2393329ce43d508a957bd94fa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Jan 2025 09:43:53 -0800 Subject: mm/damon/core: introduce damon_call() Introduce a new DAMON core API function, damon_call(). It aims to replace some damon_callback usages that access damon_ctx of ongoing kdamond with additional synchronizations. It receives a function pointer, let the parallel kdamond invokes the function, and returns after the invocation is finished, or canceled due to some races. kdamond invokes the function inside the main loop after sampling is done. If it is deactivated by DAMOS watermarks or already out of the main loop, mark the request as canceled so that damon_call() can wakeup and return. 
Link: https://lkml.kernel.org/r/20250103174400.54890-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index a67f2c4940e9..ac2d42a50751 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -552,6 +552,27 @@ struct damon_callback { void (*before_terminate)(struct damon_ctx *context); }; +/* + * struct damon_call_control - Control damon_call(). + * + * @fn: Function to be called back. + * @data: Data that will be passed to @fn. + * @return_code: Return code from @fn invocation. + * + * Control damon_call(), which requests specific kdamond to invoke a given + * function. Refer to damon_call() for more details. + */ +struct damon_call_control { + int (*fn)(void *data); + void *data; + int return_code; +/* private: internal use only */ + /* informs if the kdamond finished handling of the request */ + struct completion completion; + /* informs if the kdamond canceled @fn infocation */ + bool canceled; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. 
* @@ -632,6 +653,9 @@ struct damon_ctx { /* for scheme quotas prioritization */ unsigned long *regions_score_histogram; + struct damon_call_control *call_control; + struct mutex call_control_lock; + /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; @@ -779,6 +803,8 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); + int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end); -- cgit v1.2.3 From bf0eaba0ff9c9c8e6fd58ddfa1a8b6df4b813f61 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Jan 2025 09:43:57 -0800 Subject: mm/damon/core: implement damos_walk() Introduce a new core layer interface, damos_walk(). It aims to replace some damon_callback usages that access DAMOS schemes applied regions of ongoing kdamond with additional synchronizations. It receives a function pointer and asks kdamond to invoke it for any region that it tried to apply any DAMOS action within one scheme apply interval for every scheme of it. The function further waits until the kdamond finishes the invocations for every scheme, or cancels the request, and returns. The kdamond invokes the function as requested within the main loop. If it is deactivated by DAMOS watermarks or going out of the main loop, it marks the request as canceled, so that damos_walk() can wakeup and return. 
Link: https://lkml.kernel.org/r/20250103174400.54890-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index ac2d42a50751..2889de3526c3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -352,6 +352,31 @@ struct damos_filter { struct list_head list; }; +struct damon_ctx; +struct damos; + +/** + * struct damos_walk_control - Control damos_walk(). + * + * @walk_fn: Function to be called back for each region. + * @data: Data that will be passed to walk functions. + * + * Control damos_walk(), which requests specific kdamond to invoke the given + * function to each region that eligible to apply actions of the kdamond's + * schemes. Refer to damos_walk() for more details. + */ +struct damos_walk_control { + void (*walk_fn)(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s); + void *data; +/* private: internal use only */ + /* informs if the kdamond finished handling of the walk request */ + struct completion completion; + /* informs if the walk is canceled. */ + bool canceled; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. @@ -415,6 +440,8 @@ struct damos { * @action */ unsigned long next_apply_sis; + /* informs if ongoing DAMOS walk for this scheme is finished */ + bool walk_completed; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; @@ -442,8 +469,6 @@ enum damon_ops_id { NR_DAMON_OPS, }; -struct damon_ctx; - /** * struct damon_operations - Monitoring operations for given use cases. 
* @@ -656,6 +681,9 @@ struct damon_ctx { struct damon_call_control *call_control; struct mutex call_control_lock; + struct damos_walk_control *walk_control; + struct mutex walk_control_lock; + /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; @@ -804,6 +832,7 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end); -- cgit v1.2.3 From 626ffabe67c2359f3a88bb61fdc83a6280ef16e9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:46 -0800 Subject: mm/damon: clarify trying vs applying on damos_stat kernel-doc comment Patch series "mm/damon: enable page level properties based monitoring". TL; DR ====== This patch series enables access monitoring based on page level properties including their anonymousness, belonging cgroups and young-ness, by extending DAMOS stats and regions walk features with region-internal DAMOS filters. Background ========== DAMOS was initially developed for only access-aware system operations. But, efficient access monitoring results querying is yet another major usage of today's DAMOS. DAMOS stats and regions walk, which exposes accumulated counts and per-region monitoring results that are filtered by DAMOS parameters including target access pattern, quotas and DAMOS filters, are the key features for that usage. For tunings and investigations, it can be more useful if only the information can be exposed without making real system operational change. Special DAMOS action, DAMOS_STAT, was introduced for the purpose. DAMOS fundamentally works with only access pattern information in region granularity. 
For some use cases, fixed and fine granularity information based on non access pattern properties can be useful, though. For example, on systems having swap devices that much faster than storage devices for files, DAMOS-based proactive reclaim need to be applied differently for anonymous pages and file-backed pages. DAMOS filters is a feature that makes it possible. It supports non access pattern information including page level properties such as anonymousness, belonging cgroups, and young-ness (whether the page has accessed since the last access check of it). The information can be useful for tuning and investigations. DAMOS stat exposes some of it via {nr,sz}_applied, but it is mixed with operation failures. Also, exposing the information without making system operation change is impossible, since DAMOS_STAT simply ignores the page level properties based DAMOS filters. Design ====== Expose the exact information for every DAMOS action including DAMOS_STAT by implementing below changes. Extend the interface for DAMON operations set layer, which contains the implementation of the page level filters, to report back the amount of memory that passed the region-internal DAMOS filters to the core layer. On the core layer, account the operations set layer reported stat with DAMOS stat for per-scheme monitoring. Also, pass the information to regions walk for per-region monitoring. In this way, DAMON API users can efficiently get the fine-grained information. For the user-space, make DAMON sysfs interface collects the information using the updated DAMON core API, and expose those to new per-scheme stats file and per-DAMOS-tried region properties file. Practical Usages ================ With this patch series, DAMON users can query how many bytes of regions of specific access temperature is backed by pages of specific type. The type can be any of DAMOS filter-supporting one, including anonymousness, belonging cgroups, and young-ness. 
For example, users can visualize access hotness-based page granularity histogram for different cgroups, backing content type, or youngness. In future, it could be extended to more types such as whether it is THP, position on LRU lists, etc. This can be useful for estimating benefits of new or existing access-aware system optimizations without really committing the changes. Patches Sequence ================ The patches are constructed in four sub-sequences. First three patches (patches 1-3) update documents to have missing background knowledge and better structures for easily introducing followup changes. Following three patches (patches 4-6) change the operations set layer interface to report back the region-internal filter passed memory size, and make the operations set implementations support the changed semantics. Following five patches (patches 7-11) implement per-scheme accumulated stat for region-internal filter-passed memory size on core API (damos_stat) and DAMON sysfs interface. First two patches of those are for code change, and following three patches are for documentation. Finally, five patches (patches 12-16) implementing per-region region-internal filter-passed memory size follow. Similar to that for per-scheme stat, first two patches implement core-API and sysfs interface change. Then three patches for documentation update follow. This patch (of 16): DAMOS stat kernel-doc documentation is using terms that are a bit ambiguous. Without reading the code, understanding it correctly is not that easy. Add the clarification on the kernel-doc comment. 
Link: https://lkml.kernel.org/r/20250106193401.109161-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250106193401.109161-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2889de3526c3..b85eae388f5b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -287,6 +287,23 @@ struct damos_watermarks { * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + * + * "Tried an action to a region" in this context means the DAMOS core logic + * determined the region as eligible to apply the action. The access pattern + * (&struct damos_access_pattern), quotas (&struct damos_quota), watermarks + * (&struct damos_watermarks) and filters (&struct damos_filter) that handled + * on core logic can affect this. The core logic asks the operation set + * (&struct damon_operations) to apply the action to the region. + * + * "Applied an action to a region" in this context means the operation set + * (&struct damon_operations) successfully applied the action to the region, at + * least to a part of the region. The filters (&struct damos_filter) that + * handled on operation set layer and type of the action and pages of the + * region can affect this. For example, if a filter is set to exclude + * anonymous pages and the region has only anonymous pages, the region will be + * failed at applying the action. If the action is &DAMOS_PAGEOUT and all + * pages of the region are already paged out, the region will be failed at + * applying the action. 
*/ struct damos_stat { unsigned long nr_tried; -- cgit v1.2.3 From b5bbe9c08fd1519f96832b82256543a567ce2900 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:49 -0800 Subject: mm/damon: ask apply_scheme() to report filter-passed region-internal bytes Some DAMOS filter types including those for young page, anon page, and belonging memcg are handled by underlying DAMON operations set implementation, via damon_operations->apply_scheme() interface. How many bytes of the region have passed the filter can be useful for DAMOS scheme tuning and access pattern monitoring. Modify the interface to let the callback implementation reports back the number if possible. Link: https://lkml.kernel.org/r/20250106193401.109161-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b85eae388f5b..da003173210f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -529,7 +529,8 @@ enum damon_ops_id { * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action * to the region and return bytes of the region that the action is successfully - * applied. + * applied. It should also report how many bytes of the region has passed + * filters (&struct damos_filter) that handled by itself. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. 
@@ -546,7 +547,7 @@ struct damon_operations { struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, - struct damos *scheme); + struct damos *scheme, unsigned long *sz_filter_passed); bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; -- cgit v1.2.3 From 60fa9355a6c620f7b727d3fdb433fb6cf714a9b0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:52 -0800 Subject: mm/damon/core: implement per-scheme ops-handled filter-passed bytes stat Implement a new per-DAMOS scheme statistic field, namely sz_ops_filter_passed, using the changed damon_operations->apply_scheme() interface. It counts total bytes of memory that given DAMOS action tried to be applied, and passed the operations layer handled region-internal filters of the scheme. DAMON API users can access it using DAMON-internal safe access features such as damon_call() and/or damos_walk(). Link: https://lkml.kernel.org/r/20250106193401.109161-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index da003173210f..2a93dbe06ecc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -286,6 +286,8 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @sz_ops_filter_passed: + * Total bytes that passed ops layer-handled DAMOS filters. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. 
* * "Tried an action to a region" in this context means the DAMOS core logic @@ -310,6 +312,7 @@ struct damos_stat { unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; -- cgit v1.2.3 From cfc33a7d2daca4455ef3ebae63a2e89bd9bb0ebe Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:57 -0800 Subject: mm/damon/core: pass per-region filter-passed bytes to damos_walk_control->walk_fn() Total size of memory that passed DAMON operations set layer-handled DAMOS filters per scheme is provided to DAMON core API and ABI (sysfs interface) users. Having it per-region in non-accumulated way can provide it in finer granularity. Provide it to damos_walk() core API users, by passing the data to damos_walk_control->walk_fn(). Link: https://lkml.kernel.org/r/20250106193401.109161-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2a93dbe06ecc..298b1a831e62 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -388,7 +388,7 @@ struct damos; struct damos_walk_control { void (*walk_fn)(void *data, struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *s); + struct damos *s, unsigned long sz_filter_passed); void *data; /* private: internal use only */ /* informs if the kdamond finished handling of the walk request */ -- cgit v1.2.3 From 63db8170bf34ce9e0763f87d993cf9b4c9002b09 Mon Sep 17 00:00:00 2001 From: Bruno Faccini Date: Mon, 6 Jan 2025 04:06:59 -0800 Subject: mm/fake-numa: allow later numa node hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current fake-numa implementation prevents new Numa nodes to be later hot-plugged by drivers. 
A common symptom of this limitation is the "node was absent from the node_possible_map" message and the associated warning in mm/memory_hotplug.c: add_memory_resource(). This comes from the lack of remapping in both pxm_to_node_map[] and node_to_pxm_map[] tables to take fake-numa nodes into account, which triggers collisions with the original, physical-nodes-only mapping that had been determined from BIOS tables. This patch fixes this by doing the necessary node-id translation in both pxm_to_node_map[]/node_to_pxm_map[] tables. The node_distance[] table has also been fixed accordingly. Details: When trying to use the fake-numa feature on our system where new NUMA nodes are being "hot-plugged" upon driver load, this fails with the following type of message and warning with stack: node 8 was absent from the node_possible_map WARNING: CPU: 61 PID: 4259 at mm/memory_hotplug.c:1506 add_memory_resource+0x3dc/0x418 This issue prevents the use of the fake-NUMA debug feature with the system's full configuration, when it has proven to be sometimes extremely useful for performance testing of multi-tasked, memory-bound applications, as it enables better isolation of processes/ranks compared to fat NUMA nodes. 
Usual numactl output after driver has “hot-plugged”/unveiled some new Numa nodes with and without memory : $ numactl --hardware available: 9 nodes (0-8) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 490037 MB node 0 free: 484432 MB node 1 cpus: node 1 size: 97280 MB node 1 free: 97279 MB node 2 cpus: node 2 size: 0 MB node 2 free: 0 MB node 3 cpus: node 3 size: 0 MB node 3 free: 0 MB node 4 cpus: node 4 size: 0 MB node 4 free: 0 MB node 5 cpus: node 5 size: 0 MB node 5 free: 0 MB node 6 cpus: node 6 size: 0 MB node 6 free: 0 MB node 7 cpus: node 7 size: 0 MB node 7 free: 0 MB node 8 cpus: node 8 size: 0 MB node 8 free: 0 MB node distances: node 0 1 2 3 4 5 6 7 8 0: 10 80 80 80 80 80 80 80 80 1: 80 10 255 255 255 255 255 255 255 2: 80 255 10 255 255 255 255 255 255 3: 80 255 255 10 255 255 255 255 255 4: 80 255 255 255 10 255 255 255 255 5: 80 255 255 255 255 10 255 255 255 6: 80 255 255 255 255 255 10 255 255 7: 80 255 255 255 255 255 255 10 255 8: 80 255 255 255 255 255 255 255 10 With recent M.Rapoport set of fake-numa patches in mm-everything and using numa=fake=4 boot parameter : $ numactl --hardware available: 4 nodes (0-3) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 122518 MB node 0 free: 117141 MB node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 1 size: 219911 MB node 1 free: 219751 MB node 2 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 2 size: 122599 MB node 2 free: 122541 MB node 3 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 3 size: 122479 MB node 3 free: 122408 MB node distances: node 0 1 2 3 0: 10 10 10 10 1: 10 10 10 10 2: 10 10 10 10 3: 10 10 10 10 With recent M.Rapoport set of fake-numa patches in mm-everything, this patch on top, using numa=fake=4 boot parameter : # numactl —hardware available: 12 nodes (0-11) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 122518 MB node 0 free: 116429 MB node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 1 size: 122631 MB node 1 free: 122576 MB node 2 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 2 size: 122599 MB node 2 free: 122544 MB node 3 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 3 size: 122479 MB node 3 free: 122419 MB node 4 cpus: node 4 size: 97280 MB node 4 free: 97279 MB node 5 cpus: node 5 size: 0 MB node 5 free: 0 MB node 6 cpus: node 6 size: 0 MB node 6 free: 0 MB node 7 cpus: node 7 size: 0 MB node 7 free: 0 MB node 8 cpus: node 8 size: 0 MB node 8 free: 0 MB node 9 cpus: node 9 size: 0 MB node 
9 free: 0 MB node 10 cpus: node 10 size: 0 MB node 10 free: 0 MB node 11 cpus: node 11 size: 0 MB node 11 free: 0 MB node distances: node 0 1 2 3 4 5 6 7 8 9 10 11 0: 10 10 10 10 80 80 80 80 80 80 80 80 1: 10 10 10 10 80 80 80 80 80 80 80 80 2: 10 10 10 10 80 80 80 80 80 80 80 80 3: 10 10 10 10 80 80 80 80 80 80 80 80 4: 80 80 80 80 10 255 255 255 255 255 255 255 5: 80 80 80 80 255 10 255 255 255 255 255 255 6: 80 80 80 80 255 255 10 255 255 255 255 255 7: 80 80 80 80 255 255 255 10 255 255 255 255 8: 80 80 80 80 255 255 255 255 10 255 255 255 9: 80 80 80 80 255 255 255 255 255 10 255 255 10: 80 80 80 80 255 255 255 255 255 255 10 255 11: 80 80 80 80 255 255 255 255 255 255 255 10 Link: https://lkml.kernel.org/r/20250106120659.359610-2-bfaccini@nvidia.com Signed-off-by: Bruno Faccini Cc: David Hildenbrand Cc: John Hubbard Cc: Mike Rapoport (Microsoft) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/acpi/acpi_numa.h | 5 +++++ include/linux/numa_memblks.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index b5f594754a9e..99b960bd473c 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -17,11 +17,16 @@ extern int node_to_pxm(int); extern int acpi_map_pxm_to_node(int); extern unsigned char acpi_srat_revision; extern void disable_srat(void); +extern int fix_pxm_node_maps(int max_nid); extern void bad_srat(void); extern int srat_disabled(void); #else /* CONFIG_ACPI_NUMA */ +static inline int fix_pxm_node_maps(int max_nid) +{ + return 0; +} static inline void disable_srat(void) { } diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index cfad6ce7e1bd..dd85613cdd86 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -29,7 +29,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi); int __init numa_memblks_init(int (*init_func)(void), bool memblock_force_top_down); +extern int numa_distance_cnt; + #ifdef 
CONFIG_NUMA_EMU +extern int emu_nid_to_phys[MAX_NUMNODES]; int numa_emu_cmdline(char *str); void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, unsigned int nr_emu_nids); -- cgit v1.2.3 From b2aad24b53333f1904a55d97e3fde2246ef05bb6 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Mon, 6 Jan 2025 10:11:25 +0800 Subject: mm/memmap: prevent double scanning of memmap by kmemleak kmemleak explicitly scans the mem_map through the valid struct page objects. However, memmap_alloc() was also adding this memory to the gray object list, causing it to be scanned twice. Remove memmap_alloc() from the scan list and add a comment to clarify the behavior. Link: https://lore.kernel.org/lkml/CAOm6qn=FVeTpH54wGDFMHuCOeYtvoTx30ktnv9-w3Nh8RMofEA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250106021126.1678334-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Reviewed-by: Catalin Marinas Cc: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- include/linux/memblock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 673d5cae7c81..d48b56c1e558 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -378,6 +378,10 @@ static inline int memblock_get_region_node(const struct memblock_region *r) /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 +/* + * MEMBLOCK_ALLOC_NOLEAKTRACE avoids kmemleak tracing. It implies + * MEMBLOCK_ALLOC_ACCESSIBLE + */ #define MEMBLOCK_ALLOC_NOLEAKTRACE 1 /* We are using top down, so it is safe to use 0 here */ -- cgit v1.2.3 From 30cef82bc6e8975a360ec05b707f7fb194c875ed Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Jan 2025 15:39:58 -0500 Subject: mm/hugetlb: rename avoid_reserve to cow_from_owner The old name "avoid_reserve" can be too generic and can be used wrongly in the new call sites that want to allocate a hugetlb folio. 
It's confusing in two ways: (1) whether one can opt-in to avoid global reservation, and (2) whether it should take more than one count. In reality, this flag is only used in an extremely hacky path, in an extremely hacky way in hugetlb CoW path only, and is always used with 1, meaning "skip global reservation". Rename the flag to avoid future abuse of this flag, making it a boolean so as to reflect its true representation that it's not a counter. To make it even harder to abuse, add a comment above the function to explain it. Link: https://lkml.kernel.org/r/20250107204002.2683356-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Oscar Salvador Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 10faf42ca96a..49ec2362ce92 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -683,7 +683,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); + unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); @@ -1068,7 +1068,7 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn, static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, - int avoid_reserve) + bool cow_from_owner) { return NULL; } -- cgit v1.2.3 From c8b979530f27f90c0353a189b2faa6e50a0ea94a Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 23 Dec 2024 17:00:37 -0500 Subject: mm: alloc_pages_bulk_noprof: drop 
page_list argument Patch series "mm: alloc_pages_bulk: small API refactor", v2. Today, alloc_pages_bulk_noprof() supports two arguments to return allocated pages: a linked list and an array. There are also higher level APIs for both. However, the linked list API has apparently never been used. So, this series removes it along with the list API and also refactors the remaining API naming for consistency. This patch (of 2): commit 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator") added __alloc_pages_bulk() along with the page_list argument. The next commit 0f87d9d30f21 ("mm/page_alloc: add an array-based interface to the bulk page allocator") added the array-based argument. As it turns out, the page_list argument has no users in the current tree (if it ever had any). Dropping it allows for a slight simplification and eliminates some unnecessary checks, now that page_array is required. Also, note that the removal of the page_list argument was proposed before in the thread below, where Matthew Wilcox mentions that: """ Iterating a linked list is _expensive_. It is about 10x quicker to iterate an array than a linked list. 
""" (https://lore.kernel.org/linux-mm/20231025093254.xvomlctwhcuerzky@techsingularity.net) Link: https://lkml.kernel.org/r/cover.1734991165.git.luizcap@redhat.com Link: https://lkml.kernel.org/r/f1c75db91d08cafd211eca6a3b199b629d4ffe16.1734991165.git.luizcap@redhat.com Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c96d5d7f7b89..f8b33c5e7a14 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -212,7 +212,6 @@ struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) @@ -223,11 +222,8 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) - #define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) + __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, @@ -236,7 +232,7 @@ alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } #define alloc_pages_bulk_array_node(...) 
\ -- cgit v1.2.3 From 6bf9b5b40af373690313f64a3935b2bf2e5d46d9 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 23 Dec 2024 17:00:38 -0500 Subject: mm: alloc_pages_bulk: rename API The previous commit removed the page_list argument from alloc_pages_bulk_noprof() along with the alloc_pages_bulk_list() function. Now that only the *_array() flavour of the API remains, we can do the following renaming (along with the _noprof() ones): alloc_pages_bulk_array -> alloc_pages_bulk alloc_pages_bulk_array_mempolicy -> alloc_pages_bulk_mempolicy alloc_pages_bulk_array_node -> alloc_pages_bulk_node Link: https://lkml.kernel.org/r/275a3bbc0be20fbe9002297d60045e67ab3d4ada.1734991165.git.luizcap@redhat.com Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- include/linux/gfp.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f8b33c5e7a14..6bb1a5a7a4ae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -215,18 +215,18 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); -#define alloc_pages_bulk_array_mempolicy(...) \ - alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_mempolicy(...) 
\ + alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ +#define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long -alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, +alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) @@ -235,8 +235,8 @@ alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } -#define alloc_pages_bulk_array_node(...) \ - alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_node(...) \ + alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { -- cgit v1.2.3 From e20f52e8e3b7947e40bd40c6cdc69884c6df716c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:17 -0800 Subject: mm/damon: fixup damos_filter kernel-doc Patch series "mm/damon: extend DAMOS filters for inclusion", v2. DAMOS filters are exclusive filters. They only exclude memory of given criteria from the DAMOS action targets. This has the limitations below. First, the name does not explicitly explain the behavior. This actually resulted in user confusion[1]. Secondly, combined use of multiple filters provides only restricted coverage. For example, building a DAMOS scheme that applies the action to memory that belongs to cgroup A "or" cgroup B is impossible. A workaround would be using two schemes that filter out memory that does not belong to cgroup A and cgroup B, respectively. It is cumbersome, and makes it difficult to control quota-like per-scheme features in an orchestration. Monitoring of the filters-passed memory statistic will also be complicated. 
Extend DAMOS filters to support not only exclusion (rejecting), but also inclusion (allowing) behavior. For this, add a new damos_filter struct field called 'allow' for DAMON kernel API users. The filter works as an inclusion or exclusion filter when it is set or unset, respectively. For DAMON user-space ABI users, add a DAMON sysfs file of the same name under the DAMOS filter sysfs directory. To prevent exposing a behavioral change to old users, set rejecting as the default behavior. Note that allow-filters work only for inclusion, not for exclusion of memory that does not satisfy the criteria. And the default behavior of DAMOS for memory that no filter applies to is that the action can be applied to that memory. Also, filters-passed memory statistics are for any memory that passed through the DAMOS filters check stage. This implies that installing allow-filters at the end of the filter list is useless. Refer to the design doc change of this series for more details. [1] https://lore.kernel.org/20240320165619.71478-1-sj@kernel.org This patch (of 10): The comment is slightly wrong. DAMOS filters are not only for pages, but general bytes of memory. Also the description of 'matching' is a bit confusing, since DAMOS filters only filter out. Update the comments to be less confusing. Link: https://lkml.kernel.org/r/20250109175126.57878-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250109175126.57878-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 298b1a831e62..72afba74ac6d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -347,8 +347,8 @@ enum damos_filter_type { /** * struct damos_filter - DAMOS action target memory filter. - * @type: Type of the page. - * @matching: If the matching page should filtered out or in. 
+ * @type: Type of the target memory. + * @matching: If the @type-matching memory should be filtered out. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @target_idx: Index of the &struct damon_target of @@ -357,9 +357,10 @@ enum damos_filter_type { * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each - * page of the region matches to this and avoid applying the action if so. - * Support of each filter type depends on the running &struct damon_operations - * and the type. Refer to &enum damos_filter_type for more detai. + * byte of the region matches to this given condition and avoid applying the + * action if so. Support of each filter type depends on the running &struct + * damon_operations and the type. Refer to &enum damos_filter_type for more + * details. */ struct damos_filter { enum damos_filter_type type; -- cgit v1.2.3 From fe6d7fdd62491524d11433b9ff8d3db5dde32700 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:18 -0800 Subject: mm/damon/core: add damos_filter->allow field DAMOS filters work as only exclusive (reject) filters. This makes it easy to be confused, and restrictive at combining multiple filters for covering various types of memory. Add a field named 'allow' to damos_filter. The field will be used to indicate whether the filter should work for inclusion or exclusion. To keep the old behavior, set it as 'false' (work as exclusive filter) by default, from damos_new_filter(). Following two commits will make the core and operations set layers, which handles damos_filter objects, respect the field, respectively. 
Link: https://lkml.kernel.org/r/20250109175126.57878-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 72afba74ac6d..8a2d104df5a3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -348,7 +348,8 @@ enum damos_filter_type { /** * struct damos_filter - DAMOS action target memory filter. * @type: Type of the target memory. - * @matching: If the @type-matching memory should be filtered out. + * @matching: Whether this is for @type-matching memory. + * @allow: Whether to include or exclude the @matching memory. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @target_idx: Index of the &struct damon_target of @@ -365,6 +366,7 @@ enum damos_filter_type { struct damos_filter { enum damos_filter_type type; bool matching; + bool allow; union { unsigned short memcg_id; struct damon_addr_range addr_range; -- cgit v1.2.3 From e2fbfedad03401a38b8c3b7fd52d8fdcd039d0bc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:21 -0800 Subject: mm/damon: add 'allow' argument to damos_new_filter() DAMON API users should set damos_filter->allow manually to use a DAMOS allow-filter, since damos_new_filter() always unsets the field. It is cumbersome and easy to get wrong. Add an argument for setting the field to damos_new_filter(). 
Link: https://lkml.kernel.org/r/20250109175126.57878-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 8a2d104df5a3..0834d7ffcb84 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -801,7 +801,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching); + bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); -- cgit v1.2.3 From 07438779313caafe52ac1a1a6958d735a5938988 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Dec 2024 13:16:38 -0800 Subject: alloc_tag: avoid current->alloc_tag manipulations when profiling is disabled When memory allocation profiling is disabled there is no need to update current->alloc_tag and these manipulations add unnecessary overhead. Fix the overhead by skipping these extra updates. I ran comprehensive testing on Pixel 6 on Big, Medium and Little cores: Overhead before fixes Overhead after fixes slab alloc page alloc slab alloc page alloc Big 6.21% 5.32% 3.31% 4.93% Medium 4.51% 5.05% 3.79% 4.39% Little 7.62% 1.82% 6.68% 1.02% This is an allocation microbenchmark doing allocations in a tight loop. Not a really realistic scenario and useful only to make performance comparisons. 
Link: https://lkml.kernel.org/r/20241226211639.1357704-1-surenb@google.com Fixes: b951aaff5035 ("mm: enable page allocation tagging") Signed-off-by: Suren Baghdasaryan Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 0bbbe537c5f9..a946e0203e6d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -224,9 +224,14 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} #define alloc_hooks_tag(_tag, _do_alloc) \ ({ \ - struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \ - typeof(_do_alloc) _res = _do_alloc; \ - alloc_tag_restore(_tag, _old); \ + typeof(_do_alloc) _res; \ + if (mem_alloc_profiling_enabled()) { \ + struct alloc_tag * __maybe_unused _old; \ + _old = alloc_tag_save(_tag); \ + _res = _do_alloc; \ + alloc_tag_restore(_tag, _old); \ + } else \ + _res = _do_alloc; \ _res; \ }) -- cgit v1.2.3 From 7277433096f6ce4a84a1620529ac4ba3e1041ee1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:22 +0800 Subject: mm, swap: remove old allocation path for HDD We are currently using different swap allocation algorithm for HDD and non-HDD. This leads to the existence of a different set of locks, and the code path is heavily bloated, causing difficulties for further optimization and maintenance. This commit removes all HDD swap allocation and related dead code, and uses the cluster allocation algorithm instead. The performance may drop temporarily, but this should be negligible: The main advantage of the legacy HDD allocation algorithm is that it tends to use continuous slots, but swap device gets fragmented quickly anyway, and the attempt to use continuous slots will fail easily. 
This commit also enables mTHP swap on HDD, which is expected to be beneficial, and following commits will adapt and optimize the cluster allocator for HDD. Link: https://lkml.kernel.org/r/20250113175732.48099-4-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Suggested-by: "Huang, Ying" Reviewed-by: Baoquan He Cc: Barry Song Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index f3e0ac20c2e8..3a71198a6957 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -309,9 +309,6 @@ struct swap_info_struct { unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ - unsigned int cluster_next; /* likely index for next allocation */ - unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 27701521beb5897d6b97e2f8c20de41e74cbcb7b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:24 +0800 Subject: mm, swap: clean up device availability check Remove highest_bit and lowest_bit. After the HDD allocation path has been removed, the only purpose of these two fields is to determine whether the device is full or not, which can instead be determined by checking the inuse_pages. 
Link: https://lkml.kernel.org/r/20250113175732.48099-6-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3a71198a6957..c0d49dad7a4b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -305,8 +305,6 @@ struct swap_info_struct { struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; - unsigned int lowest_bit; /* index of first free in swap_map */ - unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ -- cgit v1.2.3 From b228386cf237e659cdf5d8037a19db0b0a06f6b5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:25 +0800 Subject: mm, swap: clean up plist removal and adding When the swap device is full (inuse_pages == pages), it should be removed from the allocation available plist. If any slot is freed, the swap device should be added back to the plist. Additionally, during swapon or swapoff, the swap device is forcefully added or removed. Currently, the condition (inuse_pages == pages) is checked after every counter update, then remove or add the device accordingly. This is serialized by si->lock. This commit decouples it from the protection of si->lock and reworked plist removal and adding, making it possible to get rid of the hard dependency on si->lock in allocation path in later commits. 
To achieve this, simply using another lock is not an optimal approach, as the overhead is observable for a hot counter, and may cause complex locking issues. Thus, this commit manages to make it a lock-free atomic operation, by embedding the plist state into the second highest bit of the atomic counter. Simply making the counter an atomic will not work, if the update and plist status check are not performed atomically, we may miss an addition or removal. With the embedded info we can update the counter and check the plist status with single atomic operations, and avoid any extra overheads: If the counter is full (inuse_pages == pages) and the off-list bit is unset, we attempt to remove it from the plist. If the counter is not full (inuse_pages != pages) and the off-list bit is set, we attempt to add it to the plist. Removing, adding and bit update is serialized with a lock, which is a cold path. Ordinary counter updates will be lock-free. Link: https://lkml.kernel.org/r/20250113175732.48099-7-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index c0d49dad7a4b..16dcf8bd1a4e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -306,7 +306,7 @@ struct swap_info_struct { /* list of cluster that are fragmented or contented */ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ - unsigned int inuse_pages; /* number of those currently in use */ + atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device 
*bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 9a0ddeb7988095a5c21994c37005a45b240039ef Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:26 +0800 Subject: mm, swap: hold a reference during scan and cleanup flag usage The flag SWP_SCANNING was used as an indicator of whether a device is being scanned for allocation, and prevents swapoff. Combined with SWP_WRITEOK, they work as a set of barriers for a clean swapoff: 1. Swapoff clears SWP_WRITEOK, allocation requests will see ~SWP_WRITEOK and abort as it's serialized by si->lock. 2. Swapoff unuses all allocated entries. 3. Swapoff waits for SWP_SCANNING flag to be cleared, so ongoing allocations will stop, preventing UAF. 4. Now swapoff can free everything safely. This will make the allocation path have a hard dependency on si->lock. Allocation always has to acquire si->lock first for setting SWP_SCANNING and checking SWP_WRITEOK. This commit removes this flag, and just uses the existing per-CPU refcount instead to prevent UAF in step 3, which serves well for such usage without dependency on si->lock, and scales very well too. Just hold a reference during the whole scan and allocation process. Swapoff will kill and wait for the counter. And for preventing any allocation from happening after step 1 so the unuse in step 2 can ensure all slots are free, swapoff will acquire the ci->lock of each cluster one by one to ensure all allocations see ~SWP_WRITEOK and abort. This way these dependencies on si->lock are gone. It is also worth noting that we can't kill the refcount as the first step for swapoff, as the unuse process has to acquire the refcount. 
Link: https://lkml.kernel.org/r/20250113175732.48099-8-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 16dcf8bd1a4e..1651174959c8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -219,7 +219,6 @@ enum { SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL -- cgit v1.2.3 From 3494d184706ff5e7d28481de0c841b039caa38b1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:27 +0800 Subject: mm, swap: use an enum to define all cluster flags and wrap flags changes Currently, we are only using flags to indicate which list the cluster is on. Using one bit for each list type might be a waste, as the list type grows, we will consume too many bits. Additionally, the current mixed usage of '&' and '==' is a bit confusing. Make it clean by using an enum to define all possible cluster statuses. Only an off-list cluster will have the NONE (0) flag. And use a wrapper to annotate and sanitize all flag settings and list movements. 
Link: https://lkml.kernel.org/r/20250113175732.48099-9-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1651174959c8..0e59cb158b15 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -256,10 +256,19 @@ struct swap_cluster_info { u8 order; struct list_head list; }; -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + +/* All on-list cluster must have a non-zero flag. */ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; /* * The first page in the swap file is the swap header, which is always marked -- cgit v1.2.3 From 3b644773eefda88112d3ee5d57620f6e58fccfc6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:28 +0800 Subject: mm, swap: reduce contention on device lock Currently, swap locking is mainly composed of two locks: the cluster lock (ci->lock) and the device lock (si->lock). The cluster lock is much more fine-grained, so it is best to use ci->lock instead of si->lock as much as possible. We have cleaned up other hard dependencies on si->lock. Following the new cluster allocator design, most operations don't need to touch si->lock at all. 
In practice, we only need to take si->lock when moving clusters between lists. To achieve this, this commit reworks the locking pattern of all si->lock and ci->lock users, eliminates all usage of ci->lock inside si->lock, and introduces a new design to avoid touching si->lock unless needed. For minimal contention and easier understanding of the system, two ideas are introduced with the corresponding helpers: isolation and relocation. - Clusters will be `isolated` from the list when iterating the list to search for an allocatable cluster. This ensures other CPUs won't walk into the same cluster easily, and it releases si->lock after acquiring ci->lock, providing the only place that handles the inversion of two locks, and avoids contention. Iterating the cluster list almost always moves the cluster (free -> nonfull, nonfull -> frag, frag -> frag tail), but it doesn't know where the cluster should be moved to until scanning is done. So keeping the cluster off-list is a good option with low overhead. The off-list time window of a cluster is also minimal. In the worst case, one CPU will return the cluster after scanning the 512 entries on it, which we used to busy wait with a spin lock. This is done with the new helper `isolate_lock_cluster`. - Clusters will be `relocated` after allocation or freeing, according to their usage count and status. Allocations no longer hold si->lock now, and may drop ci->lock for reclaim, so the cluster could be moved to any location while no lock is held. Besides, isolation clears all flags when it takes the cluster off the list (the flags must be in sync with the list status, so cluster users don't need to touch si->lock for checking its list status). So the cluster has to be relocated to the right list according to its usage after allocation or freeing. Relocation is optional, if the cluster flags indicate it's already on the right list, it will skip touching the list or si->lock. 
This is done with `relocate_cluster` after allocation or with `[partial_]free_cluster` after freeing. This handles usage of all kinds of clusters in a clean way. Scanning and allocation by iterating the cluster list is handled by "isolate - scan/allocate - relocate". Scanning and allocation of per-CPU clusters will only involve "scan/allocate - relocate", as it knows which cluster to lock and use. Freeing will only involve "relocate". Each CPU will keep using its per-CPU cluster until the 512 entries are all consumed. Freeing also has to free 512 entries to trigger cluster movement in the best case, so si->lock is rarely touched. Testing with building the Linux kernel with defconfig showed huge improvement: time make -j96 / 768M memcg, 4K pages, 10G ZRAM, on Intel 8255C: Before: Sys time: 73578.30, Real time: 864.05 After: (-50.7% sys time, -44.8% real time) Sys time: 36227.49, Real time: 476.66 time make -j96 / 1152M memcg, 64K mTHP, 10G ZRAM, on Intel 8255C: (avg of 4 test run) Before: Sys time: 74044.85, Real time: 846.51 hugepages-64kB/stats/swpout: 1735216 hugepages-64kB/stats/swpout_fallback: 430333 After: (-40.4% sys time, -37.1% real time) Sys time: 44160.56, Real time: 532.07 hugepages-64kB/stats/swpout: 1786288 hugepages-64kB/stats/swpout_fallback: 243384 time make -j32 / 512M memcg, 4K pages, 5G ZRAM, on AMD 7K62: Before: Sys time: 8098.21, Real time: 401.3 After: (-22.6% sys time, -12.8% real time ) Sys time: 6265.02, Real time: 349.83 The allocation success rate also slightly improved as we sanitized the usage of clusters with newly defined helpers; previously, dropping si->lock or ci->lock during scan would cause cluster order shuffle. 
Link: https://lkml.kernel.org/r/20250113175732.48099-10-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0e59cb158b15..5fe650beb77d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -290,6 +290,7 @@ enum swap_cluster_flags { * throughput. */ struct percpu_cluster { + local_lock_t lock; /* Protect the percpu_cluster above */ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -312,7 +313,7 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ -- cgit v1.2.3 From e3ae2dec849ba8bc5649c2d0507e02bd4379da71 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:29 +0800 Subject: mm, swap: simplify percpu cluster updating Instead of using a returning argument, we can simply store the next cluster offset to the fixed percpu location, which reduce the stack usage and simplify the function: Object size: ./scripts/bloat-o-meter mm/swapfile.o mm/swapfile.o.new add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-271 (-271) Function old new delta get_swap_pages 2847 2733 -114 alloc_swap_scan_cluster 894 737 -157 Total: Before=30833, After=30562, chg -0.88% Stack usage: Before: swapfile.c:1190:5:get_swap_pages 240 static After: 
swapfile.c:1185:5:get_swap_pages 216 static Link: https://lkml.kernel.org/r/20250113175732.48099-11-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 5fe650beb77d..75b2b0166cb1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -274,9 +274,9 @@ enum swap_cluster_flags { * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as - * a sentinel to indicate next is not valid in percpu_cluster. + * a sentinel to indicate an entry is not valid. */ -#define SWAP_NEXT_INVALID 0 +#define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) -- cgit v1.2.3 From bae8a4ef3efb56bb7e83bafd3c0856845aeaf605 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:31 +0800 Subject: mm, swap: use a global swap cluster for non-rotation devices Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the goal of the SWAP allocator is to avoid contention for clusters. It uses a per-CPU cluster design, and each CPU will use a different cluster as much as possible. However, HDDs are very sensitive to fragmentation, contention is trivial in comparison. Therefore, we use one global cluster instead. This ensures that each order will be written to the same cluster as much as possible, which helps make the I/O more continuous. This ensures that the performance of the cluster allocator is as good as that of the old allocator. 
Tests after this commit compared to those before this series: Tested using 'make -j32' with tinyconfig, a 1G memcg limit, and HDD swap: make -j32 with tinyconfig, using 1G memcg limit and HDD swap: Before this series: 114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k 2901232inputs+0outputs (238877major+4227640minor)pagefaults After this commit: 113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k 2548728inputs+0outputs (235471major+4238110minor)pagefaults [ryncsn@gmail.com: check kmalloc() return in setup_clusters] Link: https://lkml.kernel.org/r/CAMgjq7Au+o04ckHyT=iU-wVx9az=t0B-ZiC5E0bDqNrAtNOP-g@mail.gmail.com Link: https://lkml.kernel.org/r/20250113175732.48099-13-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 75b2b0166cb1..a5f475335aea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -317,6 +317,8 @@ struct swap_info_struct { unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ + struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ -- cgit v1.2.3 From 4f79384a25d57a59e142009e52f40ae1f25102fe Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:32 +0800 Subject: mm, swap_slots: remove slot 
cache for freeing path The slot cache for freeing path is mostly for reducing the overhead of si->lock. As we have basically eliminated the si->lock usage for freeing path, it can be removed. This helps simplify the code, and keeps swap entries from being held in cache upon freeing. The delayed freeing of entries has been causing trouble for further optimizations for zswap [1] and in theory will also cause more fragmentation, and extra overhead. Tests building the Linux kernel showed both performance and fragmentation are better without the cache: time make -j96 / 768M memcg, 4K pages, 10G ZRAM, avg of 4 test run: Before: Sys time: 36047.78, Real time: 472.43 After: (-7.6% sys time, -7.3% real time) Sys time: 33314.76, Real time: 437.67 time make -j96 / 1152M memcg, 64K mTHP, 10G ZRAM, avg of 4 test run: Before: Sys time: 46859.04, Real time: 562.63 hugepages-64kB/stats/swpout: 1783392 hugepages-64kB/stats/swpout_fallback: 240875 After: (-23.3% sys time, -21.3% real time) Sys time: 35958.87, Real time: 442.69 hugepages-64kB/stats/swpout: 1866267 hugepages-64kB/stats/swpout_fallback: 158330 Sequential SWAP should be also slightly faster, tests didn't show a measurable difference though, at least no regression: Swapin 4G zero page on ZRAM (time in us): Before (avg. 1923756) 1912391 1927023 1927957 1916527 1918263 1914284 1934753 1940813 1921791 After (avg. 
1922290): 1919101 1925743 1916810 1917007 1923930 1935152 1917403 1923549 1921913 Link: https://lore.kernel.org/all/CAMgjq7ACohT_uerSz8E_994ZZCv709Zor+43hdmesW_59W1BWw@mail.gmail.com/[1] Link: https://lkml.kernel.org/r/20250113175732.48099-14-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap_slots.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 15adfb8c813a..840aec3523b2 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -16,15 +16,12 @@ struct swap_slots_cache { swp_entry_t *slots; int nr; int cur; - spinlock_t free_lock; /* protects slots_ret, n_ret */ - swp_entry_t *slots_ret; int n_ret; }; void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); void enable_swap_slots_cache(void); -void free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; -- cgit v1.2.3 From f8d4a6cabb74f82c37ccb7c5e9dc3fdad50393d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 2 Jan 2025 12:10:52 +0000 Subject: mm: make mmap_region() internal Now that we have removed the one user of mmap_region() outside of mm, make it internal and add it to vma.c so it can be userland tested. This ensures that all external memory mappings are performed using the appropriate interfaces and allows us to modify memory mapping logic as we see fit. Additionally expand test stubs to allow for the mmap_region() code to compile and be userland testable. Link: https://lkml.kernel.org/r/de5a3c574d35c26237edf20a1d8652d7305709c9.1735819274.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Cc: Jann Horn Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3550cbeed488..8483e09aeb2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3363,9 +3363,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, -- cgit v1.2.3 From c6f239796b55dbc4225a6fca9f96232092b9df83 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Thu, 2 Jan 2025 15:25:28 +0800 Subject: mm/memblock: add memblock_alloc_or_panic interface Before SLUB initialization, various subsystems used memblock_alloc to allocate memory. In most cases, when memory allocation fails, an immediate panic is required. To simplify this behavior and reduce repetitive checks, introduce `memblock_alloc_or_panic`. This function ensures that memory allocation failures result in a panic automatically, improving code readability and consistency across subsystems that require this behavior. 
[guoweikang.kernel@gmail.com: arch/s390: save_area_alloc default failure behavior changed to panic] Link: https://lkml.kernel.org/r/20250109033136.2845676-1-guoweikang.kernel@gmail.com Link: https://lore.kernel.org/lkml/Z2fknmnNtiZbCc7x@kernel.org/ Link: https://lkml.kernel.org/r/20250102072528.650926-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Acked-by: Geert Uytterhoeven [m68k] Reviewed-by: Alexander Gordeev [s390] Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- include/linux/memblock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d48b56c1e558..e79eb6ac516f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -421,6 +421,12 @@ static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } +void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func); + +#define memblock_alloc_or_panic(size, align) \ + __memblock_alloc_or_panic(size, align, __func__) + static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) { -- cgit v1.2.3 From 798c0330c2ca078cc3e155e567c77c4d61345a38 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:34 -0700 Subject: mm/mglru: rework aging feedback The aging feedback is based on both the number of generations and the distribution of folios in each generation. The number of generations is currently the distance between max_seq and anon min_seq. This is because anon min_seq is not allowed to move past file min_seq. The rationale for that is that file is always evictable whereas anon is not. However, for use cases where anon is a lot cheaper than file: 1. Anon in the second oldest generation can be a better choice than file in the oldest generation. 2. 
A large amount of file in the oldest generation can skew the distribution, making should_run_aging() return a false negative. Allow anon and file min_seq to move independently, and use solely the number of generations as the feedback for aging. Specifically, when both anon and file are evictable, anon min_seq can now be greater than file min_seq, and therefore the number of generations becomes the distance between max_seq and min(min_seq[0],min_seq[1]). And should_run_aging() returns true if and only if the number of generations is less than MAX_NR_GENS. As the first step to the final optimization, this change by itself should not have userspace-visible effects beyond performance. The next two patches will take advantage of this change; the last patch in this series will better distribute folios across MAX_NR_GENS. [yuzhao@google.com: restore behaviour for systems with swappiness == 200] Link: https://lkml.kernel.org/r/Z4S3-aJy5dj9tBTk@google.com Link: https://lkml.kernel.org/r/20241231043538.4075764-4-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: David Stevens Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b36124145a16..8245ecb0400b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -421,12 +421,11 @@ enum { /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are - * stored in min_seq[] separately for anon and file types as clean file pages - * can be evicted regardless of swap constraints. - * - * Normally anon and file min_seq are in sync. But if swapping is constrained, - * e.g., out of swap space, file min_seq is allowed to advance and leave anon - * min_seq behind. 
+ * stored in min_seq[] separately for anon and file types so that they can be + * incremented independently. Ideally min_seq[] are kept in sync when both anon + * and file types are evictable. However, to adapt to situations like extreme + * swappiness, they are allowed to be out of sync by at most + * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. @@ -446,8 +445,8 @@ struct lru_gen_folio { unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; - /* the first tier doesn't need protection, hence the minus one */ - unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can only be modified under the LRU lock */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; @@ -498,7 +497,7 @@ struct lru_gen_mm_walk { int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; - bool can_swap; + int swappiness; bool force_scan; }; -- cgit v1.2.3 From 4d5d14a01e2c9091b128fb46e1d07475e9a7bb72 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:37 -0700 Subject: mm/mglru: rework workingset protection With the aging feedback no longer considering the distribution of folios in each generation, rework workingset protection to better distribute folios across MAX_NR_GENS. This is achieved by reusing PG_workingset and PG_referenced/LRU_REFS_FLAGS in a slightly different way. For folios accessed multiple times through file descriptors, make lru_gen_inc_refs() set additional bits of LRU_REFS_WIDTH in folio->flags after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. 
After all its bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily promoted into the second oldest generation in the eviction path. And when folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that lru_gen_inc_refs() can start over. For this case, LRU_REFS_MASK is only valid when PG_referenced is set. For folios accessed multiple times through page tables, folio_update_gen() from a page table walk or lru_gen_set_refs() from a rmap walk sets PG_referenced after the accessed bit is cleared for the first time. Thereafter, those two paths set PG_workingset and promote folios to the youngest generation. Like folio_inc_gen(), when folio_update_gen() does that, it also clears PG_referenced. For this case, LRU_REFS_MASK is not used. For both of the cases, after PG_workingset is set on a folio, it remains until this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It can be set again if lru_gen_test_recent() returns true upon a refault. When adding folios to the LRU lists, lru_gen_folio_seq() distributes them as follows: +---------------------------------+---------------------------------+ | Accessed thru page tables | Accessed thru file descriptors | +---------------------------------+---------------------------------+ | PG_active (set while isolated) | | +----------------+----------------+----------------+----------------+ | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | +---------------------------------+---------------------------------+ |<--------- MIN_NR_GENS --------->| | |<-------------------------- MAX_NR_GENS -------------------------->| After this patch, some typical client and server workloads showed improvements under heavy memory pressure. 
For example, Python TPC-C, which was used to benchmark a different approach [1] to better detect refault distances, showed a significant decrease in total refaults: Before After Change Time (seconds) 10801 10801 0% Executed (transactions) 41472 43663 +5% workingset_nodes 109070 120244 +10% workingset_refault_anon 5019627 7281831 +45% workingset_refault_file 1294678786 554855564 -57% workingset_refault_total 1299698413 562137395 -57% [1] https://lore.kernel.org/20230920190244.16839-1-ryncsn@gmail.com/ Link: https://lkml.kernel.org/r/20241231043538.4075764-7-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: Kairui Song Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/ Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: David Stevens Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 88 +++++++++++++++++++++++------------------------ include/linux/mmzone.h | 82 ++++++++++++++++++++++++++----------------- 2 files changed, 94 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 34e5097182a0..f9157a0c42a5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq) return seq % NR_HIST_GENS; } -static inline int lru_tier_from_refs(int refs) +static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); - /* see the comment in folio_lru_refs() */ - return order_base_2(refs + 1); + /* see the comment on MAX_NR_TIERS */ + return workingset ? 
MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); - bool workingset = flags & BIT(PG_workingset); + if (!(flags & BIT(PG_referenced))) + return 0; /* - * Return the number of accesses beyond PG_referenced, i.e., N-1 if the - * total number of accesses is N>1, since N=0,1 both map to the first - * tier. lru_tier_from_refs() will account for this off-by-one. Also see - * the comment on MAX_NR_TIERS. + * Return the total number of accesses including PG_referenced. Also see + * the comment on LRU_REFS_FLAGS. */ - return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; -} - -static inline void folio_clear_lru_refs(struct folio *folio) -{ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) @@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } +static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, + bool reclaiming) +{ + int gen; + int type = folio_is_file_lru(folio); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + /* + * +-----------------------------------+-----------------------------------+ + * | Accessed through page tables and | Accessed through file descriptors | + * | promoted by folio_update_gen() | and protected by folio_inc_gen() | + * +-----------------------------------+-----------------------------------+ + * | PG_active (set while isolated) | | + * +-----------------+-----------------+-----------------+-----------------+ + * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | + * +-----------------------------------+-----------------------------------+ + * |<---------- MIN_NR_GENS ---------->| | + * |<---------------------------- 
MAX_NR_GENS ---------------------------->| + */ + if (folio_test_active(folio)) + gen = MIN_NR_GENS - folio_test_workingset(folio); + else if (reclaiming) + gen = MAX_NR_GENS; + else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + gen = MIN_NR_GENS; + else + gen = MAX_NR_GENS - folio_test_workingset(folio); + + return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; - unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, if (folio_test_unevictable(folio) || !lrugen->enabled) return false; - /* - * There are four common cases for this page: - * 1. If it's hot, i.e., freshly faulted in, add it to the youngest - * generation, and it's protected over the rest below. - * 2. If it can't be evicted immediately, i.e., a dirty page pending - * writeback, add it to the second youngest generation. - * 3. If it should be evicted first, e.g., cold and clean from - * folio_rotate_reclaimable(), add it to the oldest generation. - * 4. Everything else falls between 2 & 3 above and is added to the - * second oldest generation if it's considered inactive, or the - * oldest generation otherwise. See lru_gen_is_active(). 
- */ - if (folio_test_active(folio)) - seq = lrugen->max_seq; - else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || - (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio)))) - seq = lrugen->max_seq - 1; - else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) - seq = lrugen->min_seq[type]; - else - seq = lrugen->min_seq[type] + 1; + seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - mask = LRU_GEN_MASK; - /* - * Don't clear PG_workingset here because it can affect PSI accounting - * if the activation is due to workingset refault. - */ - if (folio_test_active(folio)) - mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); - set_mask_bits(&folio->flags, mask, flags); + set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8245ecb0400b..9540b41894da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -332,66 +332,88 @@ enum lruvec_flags { #endif /* !__GENERATING_BOUNDS_H */ /* - * Evictable pages are divided into multiple generations. The youngest and the + * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->folios[]. Otherwise it stores 0. + * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * - * A page is added to the youngest generation on faulting. 
The aging needs to - * check the accessed bit at least twice before handing this page over to the - * eviction. The first check takes care of the accessed bit set on the initial - * fault; the second check makes sure this page hasn't been used since then. - * This process, AKA second chance, requires a minimum of two generations, - * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive - * LRU, e.g., /proc/vmstat, these two generations are considered active; the - * rest of generations, if they exist, are considered inactive. See - * lru_gen_is_active(). + * After a folio is faulted in, the aging needs to check the accessed bit at + * least twice before handing this folio over to the eviction. The first check + * clears the accessed bit from the initial fault; the second check makes sure + * this folio hasn't been used since then. This process, AKA second chance, + * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI + * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two + * generations are considered active; the rest of generations, if they exist, + * are considered inactive. See lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->folios[] so - * that the aging needs not to worry about it. And it's set again when a page - * considered active is isolated for non-reclaiming purposes, e.g., migration. - * See lru_gen_add_folio() and lru_gen_del_folio(). + * PG_active is always cleared while a folio is on one of lrugen->folios[] so + * that the sliding window needs not to worry about it. And it's set again when + * a folio considered active is isolated for non-reclaiming purposes, e.g., + * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. 
This requires order_base_2(MAX_NR_GENS+1) bits - in folio->flags. + in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* - * Each generation is divided into multiple tiers. A page accessed N times - * through file descriptors is in tier order_base_2(N). A page in the first tier - * (N=0,1) is marked by PG_referenced unless it was faulted in through page - * tables or read ahead. A page in any other tier (N>1) is marked by - * PG_referenced and PG_workingset. This implies a minimum of two tiers is - * supported without using additional bits in folio->flags. + * Each generation is divided into multiple tiers. A folio accessed N times + * through file descriptors is in tier order_base_2(N). A folio in the first + * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by + * PG_workingset. A folio in any other tier (1<N<5) between the first and last + * is marked by additional bits of LRU_REFS_WIDTH in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, - * comparisons of refaulted/(evicted+protected) from the first tier and the - * rest infer whether pages accessed multiple times through file descriptors - * are statistically hot and thus worth protecting. + * comparisons of refaulted/(evicted+protected) from the first tier and the rest + * infer whether folios accessed multiple times through file descriptors are + * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in - * folio->flags. + * folio->flags, masked by LRU_REFS_MASK. 
*/ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H -struct lruvec; -struct page_vma_mapped_walk; - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* + * For folios accessed multiple times through file descriptors, + * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags + * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its + * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily + * promoted into the second oldest generation in the eviction path. And when + * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that + * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is + * only valid when PG_referenced is set. + * + * For folios accessed multiple times through page tables, folio_update_gen() + * from a page table walk or lru_gen_set_refs() from a rmap walk sets + * PG_referenced after the accessed bit is cleared for the first time. + * Thereafter, those two paths set PG_workingset and promote folios to the + * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears + * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. + * + * For both cases above, after PG_workingset is set on a folio, it remains until + * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It + * can be set again if lru_gen_test_recent() returns true upon a refault. 
+ */ +#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) + +struct lruvec; +struct page_vma_mapped_walk; + #ifdef CONFIG_LRU_GEN enum { @@ -406,8 +428,6 @@ enum { NR_LRU_GEN_CAPS }; -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) -- cgit v1.2.3 From d670c8e5302af8ccdf5f12242e58816420738bb5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Jan 2025 15:22:21 +0000 Subject: mm: remove PageTransTail() The last caller was removed in October. Also remove the FALSE definition of PageTransCompoundMap(); the normal definition was removed a few years ago. Link: https://lkml.kernel.org/r/20250109152245.1591914-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Acked-by: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 691506bdf2c5..330929b6e062 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -894,21 +894,9 @@ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } - -/* - * PageTransTail returns true for both transparent huge pages - * and hugetlbfs pages, so it should only be called when it's known - * that hugetlbfs pages aren't involved. 
- */ -static inline int PageTransTail(const struct page *page) -{ - return PageTail(page); -} #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) -TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) -TESTPAGEFLAG_FALSE(TransTail, transtail) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -- cgit v1.2.3 From d783cc5913f17b2b5d9c51cb0904860ec97ed44d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Jan 2025 10:52:32 -0800 Subject: mm/damon: explain "effective quota" on kernel-doc comment The kernel-doc comment for 'struct damos_quota' describes how "effective quota" is calculated, but does not explain what it is. Actually there was an input[1] about it. Add the explanation on the comment. Also, fix a trivial typo on the comment block: s/empt/empty/ [1] https://github.com/damonitor/damo/issues/17#issuecomment-2497525043 Link: https://lkml.kernel.org/r/20250110185232.54907-6-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Honggyu Kim Cc: Yunjeong Mun Cc: Honggyu Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0834d7ffcb84..af525252b853 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -193,11 +193,16 @@ struct damos_quota_goal { * size quota is set, DAMON tries to apply the action only up to &sz bytes * within &reset_interval. * - * Internally, the time quota is transformed to a size quota using estimated - * throughput of the scheme's action. DAMON then compares it against &sz and - * uses smaller one as the effective quota. + * To convince the different types of quotas and goals, DAMON internally + * converts those into one single size quota called "effective quota". DAMON + * internally uses it as the only one real quota. The conversion is made as + * follows. 
* - * If @goals is not empt, DAMON calculates yet another size quota based on the + * The time quota is transformed to a size quota using estimated throughput of + * the scheme's action. DAMON then compares it against &sz and uses smaller + * one as the effective quota. + * + * If @goals is not empty, DAMON calculates yet another size quota based on the * goals using its internal feedback loop algorithm, for every @reset_interval. * Then, if the new size quota is smaller than the effective quota, it uses the * new size quota as the effective quota. -- cgit v1.2.3 From 3ab76c767bc783c122a8dfe105fbc10a0b029b42 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 10 Jan 2025 17:40:34 +0800 Subject: ksm: add ksm involvement information for each process In /proc/<pid>/ksm_stat, add two extra ksm involvement items including KSM_mergeable and KSM_merge_any. It helps administrators to better know the system's KSM behavior at process level. ksm_merge_any: yes/no whether the process's mm is added by prctl() into the candidate list of KSM or not, and fully enabled at process level. ksm_mergeable: yes/no whether any VMAs of the process's mm are currently applicable to KSM. Purpose ======= These two items are just to improve the observability of KSM at process level, so that users can know if a certain process has enabled KSM. For example, without these two items, when we look at /proc/<pid>/ksm_stat and there's no merging pages found, we are not sure whether it is because KSM was not enabled or because KSM did not successfully merge any pages. Although "mg" in /proc/<pid>/smaps indicates VM_MERGEABLE, it's opaque and not very obvious for non-professionals. 
[akpm@linux-foundation.org: wording tweaks, per David and akpm] Link: https://lkml.kernel.org/r/20250110174034304QOb8eDoqtFkp3_t8mqnqc@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Tested-by: Mario Casquero Cc: Wang Yaxin Cc: Yang Yang Signed-off-by: Andrew Morton --- include/linux/ksm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 6a53ac4885bb..d73095b5cd96 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -93,6 +93,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); +bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -- cgit v1.2.3 From 8d91fed83cc12306cbb63efa6c473ffee117977a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:06 +0100 Subject: mm/huge_memory: convert has_hwpoisoned into a pure folio flag Patch series "mm: hugetlb+THP folio and migration cleanups", v2. Some cleanups around more folio conversion and migration handling that I collected working on random stuff. This patch (of 6): Let's stop setting it on pages, there is no need to anymore. Link: https://lkml.kernel.org/r/20250113131611.2554758-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Baolin Wang Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 330929b6e062..616b57ddc3fe 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -906,11 +906,9 @@ TESTPAGEFLAG_FALSE(TransCompound, transcompound) * * This flag is set by hwpoison handler. Cleared by THP split or free page. 
*/ -PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) - TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) +FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else -PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) - TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) +FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* -- cgit v1.2.3 From 4c640f128074e0d4459ecf072595a44df5c2ae18 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:07 +0100 Subject: mm/hugetlb: rename isolate_hugetlb() to folio_isolate_hugetlb() Let's make the function name match "folio_isolate_lru()", and add some kernel doc. Link: https://lkml.kernel.org/r/20250113131611.2554758-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Baolin Wang Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 49ec2362ce92..c95ad5cd7894 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -153,7 +153,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_hugetlb(struct folio *folio, struct list_head *list); +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -414,7 +414,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } -- cgit v1.2.3 From b235448e8cab7eea17d164efc7bf55505985ba65 Mon Sep 17 
00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:09 +0100 Subject: mm/hugetlb: rename folio_putback_active_hugetlb() to folio_putback_hugetlb() Now that folio_putback_hugetlb() is only called on folios that were previously isolated through folio_isolate_hugetlb(), let's rename it to match folio_putback_lru(). Add some kernel doc to clarify how this function is supposed to be used. Link: https://lkml.kernel.org/r/20250113131611.2554758-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c95ad5cd7894..ec8c0ccc8f95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -157,7 +157,7 @@ bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void folio_putback_active_hugetlb(struct folio *folio); +void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -430,7 +430,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void folio_putback_active_hugetlb(struct folio *folio) +static inline void folio_putback_hugetlb(struct folio *folio) { } -- cgit v1.2.3 From cceba6f7e46c48deca433030d80fc34599fb9fd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:42 -0700 Subject: mm: add PG_dropbehind folio flag Add a folio flag that file IO can use to indicate that the cached IO being done should be dropped from the page 
cache upon completion. Link: https://lkml.kernel.org/r/20241220154831.1086649-5-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ include/trace/events/mmflags.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 616b57ddc3fe..36d283552f80 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -110,6 +110,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -562,6 +563,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) +FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. 
page_zone() is not diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index bb8a59c6caa2..3bc8656c8359 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -116,7 +116,8 @@ DEF_PAGEFLAG_NAME(head), \ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ - DEF_PAGEFLAG_NAME(unevictable) \ + DEF_PAGEFLAG_NAME(unevictable), \ + DEF_PAGEFLAG_NAME(dropbehind) \ IF_HAVE_PG_MLOCK(mlocked) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ -- cgit v1.2.3 From 77d075221ae777296e2b18a0a4f5fea6f75daf2c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:43 -0700 Subject: mm/readahead: add readahead_control->dropbehind member If ractl->dropbehind is set to true, then folios created are marked as dropbehind as well. Link: https://lkml.kernel.org/r/20241220154831.1086649-6-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fc2e1319c7bb..d53c49abead6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1358,6 +1358,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; bool _workingset; unsigned long _pflags; }; -- cgit v1.2.3 From b9f958d4f146bd11be33a5f2bc3ced50f86d6b23 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. 
Link: https://lkml.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 14 +++++++++++++- include/uapi/linux/fs.h | 6 +++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..6a838b5479a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -3614,6 +3618,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if 
(IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..56a4f93a08f4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t; /* Atomic Write */ #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ + RWF_DONTCACHE) #define PROCFS_IOCTL_MAGIC 'f' -- cgit v1.2.3 From dddc559f2e7cff9c6525150cd29ef3a4f6692b26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:48 -0700 Subject: mm/filemap: add filemap_fdatawrite_range_kick() helper Works like filemap_fdatawrite_range(), except it's a non-integrity data writeback and hence only starts writeback on the specified range. Will help facilitate generically starting uncached writeback from generic_write_sync(), as header dependencies preclude doing this inline from fs.h. Link: https://lkml.kernel.org/r/20241220154831.1086649-11-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a838b5479a6..653b5efa3d3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,6 +2878,8 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end); static inline int file_write_and_wait(struct file *file) { -- cgit v1.2.3 From 1d4457576570627e1702614bc060b55d95b85e39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:49 -0700 Subject: mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue When a buffered write submitted with IOCB_DONTCACHE has been successfully submitted, call filemap_fdatawrite_range_kick() to kick off the IO. File systems call generic_write_sync() for any successful buffered write submission, hence add the logic here rather than needing to modify the file system. Link: https://lkml.kernel.org/r/20241220154831.1086649-12-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 653b5efa3d3f..58a618853574 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2912,6 +2912,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) (iocb->ki_flags & IOCB_SYNC) ? 
0 : 1); if (ret) return ret; + } else if (iocb->ki_flags & IOCB_DONTCACHE) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, + iocb->ki_pos + count); } return count; -- cgit v1.2.3 From d94d23fdd7529f1f3218235d1e0a69e9856907b7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:50 -0700 Subject: mm: add FGP_DONTCACHE folio creation flag Callers can pass this in for uncached folio creation, in which case if a folio is newly created it gets marked as uncached. If a folio exists for this index and lookup succeeds, then it will not get marked as uncached. If an !uncached lookup finds a cached folio, clear the flag. For that case, there are competing uncached and cached users of the folio, and it should not get pruned. Link: https://lkml.kernel.org/r/20241220154831.1086649-13-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d53c49abead6..47bfc6b1b632 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -710,6 +710,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) + * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. 
*/ @@ -723,6 +724,7 @@ typedef unsigned int __bitwise fgf_t; #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) +#define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) -- cgit v1.2.3 From 3c7fd94205f86ad89f1d1d01dbfbc4b139860d8f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 16 Jan 2025 10:27:30 -0800 Subject: seqlock: add missing parameter documentation for raw_seqcount_try_begin() Add missing documentation for raw_seqcount_try_begin() start parameter. Link: https://lkml.kernel.org/r/20250116182730.801497-1-surenb@google.com Fixes: dba4761a3e40 ("seqlock: add raw_seqcount_try_begin") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20250116170522.23e884d5@canb.auug.org.au/ Signed-off-by: Suren Baghdasaryan Acked-by: Waiman Long Cc: Boqun Feng Cc: David Hildenbrand Cc: Ingo Molnar Cc: Liam Howlett Cc: Peter Zijlstra (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/seqlock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 22c2c48b4265..b783a3a7ed62 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -322,6 +322,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) * raw_seqcount_try_begin() - begin a seqcount_t read critical section * w/o lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * @start: count to be passed to read_seqcount_retry() * * Similar to raw_seqcount_begin(), except it enables eliding the critical * section entirely if odd, instead of doing the speculation knowing it will -- cgit v1.2.3