From 800977f6f32e452cba6b04ef21d2f5383ca29209 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:52 -0800 Subject: kthread: add the helper function kthread_run_on_cpu() Add a new helper function kthread_run_on_cpu(), which includes kthread_create_on_cpu/wake_up_process(). In some cases, use kthread_run_on_cpu() directly instead of kthread_create_on_node/kthread_bind/wake_up_process() or kthread_create_on_cpu/wake_up_process() or kthread_create/kthread_bind/wake_up_process() to simplify the code. [akpm@linux-foundation.org: export kthread_create_on_cpu to modules] Link: https://lkml.kernel.org/r/20211022025711.3673-2-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Cai Huoqing Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..db47aae7c481 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -56,6 +56,31 @@ bool kthread_is_per_cpu(struct task_struct *k); __k; \ }) +/** + * kthread_run_on_cpu - create and wake a cpu bound thread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_on_cpu() + * followed by wake_up_process(). Returns the kthread or + * ERR_PTR(-ENOMEM). + */ +static inline struct task_struct * +kthread_run_on_cpu(int (*threadfn)(void *data), void *data, + unsigned int cpu, const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_cpu(threadfn, data, cpu, namefmt); + if (!IS_ERR(p)) + wake_up_process(p); + + return p; +} + void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); -- cgit v1.2.3 From 60115fa54ad7b913b7cb5844e6b7ffeb842d55f2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jan 2022 14:04:11 -0800 Subject: mm: defer kmemleak object creation of module_alloc() Yongqiang reports a kmemleak panic on module insmod/rmmod with KASAN enabled (without KASAN_VMALLOC) on x86 [1]. When the module area allocates memory, its kmemleak object is created successfully, but the KASAN shadow memory for the module allocation is not ready yet, so when kmemleak scans the module's pointers it panics, because the KASAN check finds no shadow memory:

module_alloc
  __vmalloc_node_range
    kmemleak_vmalloc
                        kmemleak_scan
                          update_checksum
  kasan_module_alloc
    kmemleak_ignore

Note that there is no problem if KASAN_VMALLOC is enabled: the entire shadow memory for the module area is preallocated. Thus, the bug only exists on architectures that dynamically allocate the module area's shadow memory per module load; for now, only x86/arm64/s390 are involved. Add a VM_DEFER_KMEMLEAK flag and defer the kmemleak registration of the vmalloc'ed object in module_alloc() to fix this issue. 
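For context, here is a hedged sketch of how an architecture's module_alloc() ends up using the new flag; it is modeled on the x86 side of this series, and the exact bounds and gfp mask shown are illustrative rather than the actual hunk:

/*
 * Sketch of an arch module_alloc() after this change (illustrative).
 * VM_DEFER_KMEMLEAK makes __vmalloc_node_range() skip the immediate
 * kmemleak_vmalloc() registration; the object is registered only once
 * kasan_module_alloc() has populated the shadow memory below.
 */
void *module_alloc(unsigned long size)
{
	gfp_t gfp_mask = GFP_KERNEL;
	void *p;

	p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR,
				 MODULES_END, gfp_mask, PAGE_KERNEL,
				 VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
				 __builtin_return_address(0));
	if (p && kasan_module_alloc(p, size, gfp_mask) < 0) {
		vfree(p);
		return NULL;
	}
	return p;
}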
[1] https://lore.kernel.org/all/6d41e2b9-4692-5ec4-b1cd-cbe29ae89739@huawei.com/ [wangkefeng.wang@huawei.com: fix build] Link: https://lkml.kernel.org/r/20211125080307.27225-1-wangkefeng.wang@huawei.com [akpm@linux-foundation.org: simplify ifdefs, per Andrey] Link: https://lkml.kernel.org/r/CA+fCnZcnwJHUQq34VuRxpdoY6_XbJCDJ-jopksS5Eia4PijPzw@mail.gmail.com Link: https://lkml.kernel.org/r/20211124142034.192078-1-wangkefeng.wang@huawei.com Fixes: 793213a82de4 ("s390/kasan: dynamic shadow mem allocation for modules") Fixes: 39d114ddc682 ("arm64: add KASAN support") Fixes: bebf56a1b176 ("kasan: enable instrumentation of global variables") Signed-off-by: Kefeng Wang Reported-by: Yongqiang Liu Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Will Deacon Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Alexander Gordeev Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ++-- include/linux/vmalloc.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d8783b682669..89c99e5e67de 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -474,12 +474,12 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, * allocations with real shadow memory. With KASAN vmalloc, the special * case is unnecessary, as the work is handled in the generic case. */ -int kasan_module_alloc(void *addr, size_t size); +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6e022cc712e6..880227b9f044 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -28,6 +28,13 @@ struct notifier_block; /* in notifier.h */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) +#define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ +#else +#define VM_DEFER_KMEMLEAK 0 +#endif + /* * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. * -- cgit v1.2.3 From c4386bd8ee3a921c3c799b7197dc898ade76a453 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:22 -0800 Subject: mm/memremap: add ZONE_DEVICE support for compound pages Add a new @vmemmap_shift property for struct dev_pagemap which specifies that a devmap is composed of a set of compound pages of order @vmemmap_shift, instead of base pages. When a compound page devmap is requested, all but the first page are initialised as tail pages instead of order-0 pages. For certain ZONE_DEVICE users like device-dax which have a fixed page size, this creates an opportunity to optimize GUP and GUP-fast walkers, treating it the same way as THP or hugetlb pages. 
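As a hedged illustration of the opt-in, here is a sketch of a ZONE_DEVICE user requesting a compound devmap; it is modeled on the device-dax change later in this series, and the helper name and MEMORY_DEVICE_GENERIC usage are illustrative assumptions:

/*
 * Sketch only. With vmemmap_shift set, memremap_pages() initialises
 * each @align-sized chunk as one compound page (head + tail pages)
 * instead of order-0 pages.
 */
static void setup_compound_pgmap(struct dev_pagemap *pgmap,
				 unsigned long align)
{
	pgmap->type = MEMORY_DEVICE_GENERIC;
	/* e.g. align == PMD_SIZE gives one compound page per 2M chunk */
	if (align > PAGE_SIZE)
		pgmap->vmemmap_shift = order_base_2(align >> PAGE_SHIFT);
	/* pgmap_vmemmap_nr(pgmap) now returns pages per compound page */
}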
Additionally, commit 7118fc2906e2 ("hugetlb: address ref count racing in prep_compound_gigantic_page") removed set_page_count() because the setting of the page ref count to zero was redundant. devmap pages don't come from the page allocator, though, and only the head page's refcount is used for compound pages, hence initialize the tail page refcounts to zero. Link: https://lkml.kernel.org/r/20211202204422.26777-5-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c0e9d35889e8..61a6a0e27359 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -99,6 +99,11 @@ struct dev_pagemap_ops { * @done: completion for @internal_ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior + * @vmemmap_shift: structural definition of how the vmemmap page metadata + * is populated, specifically the metadata page order. + * A zero value (default) uses base pages as the vmemmap metadata + * representation. A bigger value will set up compound struct pages + * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no @@ -114,6 +119,7 @@ struct dev_pagemap { struct completion done; enum memory_type type; unsigned int flags; + unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; @@ -130,6 +136,11 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) return NULL; } +static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) +{ + return 1 << pgmap->vmemmap_shift; +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); -- cgit v1.2.3 From 3e9d80a891df3b1a5d77db47fa7fdf33ba71e5cb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:05:04 -0800 Subject: mm,fs: split dump_mapping() out from dump_page() dump_mapping() is a big chunk of dump_page(), and it'd be handy to be able to call it when we don't have a struct page. Split it out and move it to fs/inode.c. Take the opportunity to simplify some of the debug messages a little. Link: https://lkml.kernel.org/r/20211121121056.2870061-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..5315fa68f751 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3152,6 +3152,7 @@ extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); +void dump_mapping(const struct address_space *); /* * Userspace may rely on the the inode number being non-zero. 
For example, glibc -- cgit v1.2.3 From b6bf9abb0aa44e53ffe9c1e6e1d32568f5b25e4a Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Fri, 14 Jan 2022 14:05:35 -0800 Subject: mm/memcg: add oom_group_kill memory event Our container agent wants to know, when a container exits, whether it was OOM killed, so it can report that to the user. We use memory.oom.group = 1 to ensure that OOM kills within the container's cgroup kill everything. The existing memory.events are insufficient for knowing whether this triggered:

1) Our current approach reads memory.events oom_kill and reports the container was killed if the value is non-zero. This is erroneous in some cases where containers create their children cgroups with memory.oom.group=1, as such OOM kills will get counted against the container cgroup's oom_kill counter despite not actually OOM killing the entire container.

2) Reading memory.events.local will fail to identify OOM kills in leaf cgroups (that don't set memory.oom.group) within the container cgroup.

This patch adds a new oom_group_kill event when memory.oom.group triggers, to allow userspace to cleanly identify when an entire cgroup is OOM killed. [schatzberg.dan@gmail.com: changes from Johannes and Chris] Link: https://lkml.kernel.org/r/20211213162511.2492267-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20211203162426.3375036-1-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Chris Down Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Tejun Heo Cc: Zefan Li Cc: Jonathan Corbet Cc: Vladimir Davydov Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c5c403f4be6..951f24f42147 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -42,6 +42,7 @@ enum memcg_memory_event { MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, -- cgit v1.2.3 From 4e5aa1f4c2b489bc6f3ab5ca54747b18a847289d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 14 Jan 2022 14:05:45 -0800 Subject: memcg: add per-memcg vmalloc stat The kvmalloc* allocation functions can fall back to vmalloc allocations, and they do so more often on long-running machines. In addition, the kernel does have __GFP_ACCOUNT kvmalloc* calls. So, often on long-running machines, memory.stat does not give a complete picture of which type of memory is charged to the memcg. So add a per-memcg vmalloc stat. 
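To make the charging site concrete, here is a hedged sketch of where the stat gets bumped, modeled on the mm/vmalloc.c side of this change (not part of the header diff below); the wrapper function name is illustrative:

/*
 * Sketch only. Pages backing a __GFP_ACCOUNT vmalloc area are charged
 * to MEMCG_VMALLOC one page at a time, using mod_memcg_page_state()
 * from the header diff that follows.
 */
static void account_vmalloc_pages(struct vm_struct *area, gfp_t gfp_mask)
{
	unsigned int i;

	if (!(gfp_mask & __GFP_ACCOUNT))
		return;
	for (i = 0; i < area->nr_pages; i++)
		mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
}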
[shakeelb@google.com: page_memcg() within rcu lock, per Muchun] Link: https://lkml.kernel.org/r/20211222052457.1960701-1-shakeelb@google.com [akpm@linux-foundation.org: remove cast, per Muchun] [shakeelb@google.com: remove area->page[0] checks and move to page by page accounting per Michal] Link: https://lkml.kernel.org/r/20220104222341.3972772-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20211221215336.1922823-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 951f24f42147..0131e5574c88 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -33,6 +33,7 @@ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, + MEMCG_VMALLOC, MEMCG_NR_STAT, }; @@ -992,6 +993,21 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline void mod_memcg_page_state(struct page *page, + int idx, int val) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = page_memcg(page); + if (memcg) + mod_memcg_state(memcg, idx, val); + rcu_read_unlock(); +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return READ_ONCE(memcg->vmstats.state[idx]); @@ -1447,6 +1463,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline void mod_memcg_page_state(struct page *page, + int idx, int val) +{ +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; -- cgit v1.2.3 From 9a10064f5625d5572c3626c1516e0bebc6c9fe9b Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 14 Jan 2022 14:05:59 -0800 Subject: mm: add a field to store names for private anonymous memory In many userspace applications, and especially in VM-based applications like those Android uses heavily, there are multiple different allocators in use. At a minimum there is libc malloc and the stack, and in many cases there are libc malloc, the stack, direct syscalls to mmap anonymous memory, and multiple VM heaps (one for small objects, one for big objects, etc.). Each of these layers usually has its own tools to inspect its usage; malloc by compiling a debug version, the VM through heap inspection tools, and for direct syscalls there is usually no way to track them. On Android we heavily use a set of tools that use an extended version of the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped in userspace and slice their usage by process, shared (COW) vs. unique mappings, backing, etc. This can account for real physical memory usage even in cases like fork without exec (which Android uses heavily to share as many private COW pages as possible between processes), Kernel SamePage Merging, and clean zero pages. It produces a measurement of the pages that only exist in that process (USS, for unique), and a measurement of the physical memory usage of that process with the cost of shared pages being evenly split between processes that share them (PSS). 
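Before the motivation continues, a hedged userspace sketch of the interface this patch ultimately adds (described in detail further below); the prctl constants come from the uapi side of this series and are redefined here defensively in case an older prctl.h lacks them:

/*
 * Sketch only: name an anonymous mapping so tools can tell heaps apart.
 */
#include <stddef.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41
#define PR_SET_VMA_ANON_NAME	0
#endif

static void *alloc_named_anon(size_t len, const char *name)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p != MAP_FAILED)
		/* shows up as [anon:<name>] in /proc/pid/maps */
		prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		      (unsigned long)p, len, (unsigned long)name);
	return p;
}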
If all anonymous memory is indistinguishable then figuring out the real physical memory usage (PSS) of each heap requires either a pagemap walking tool that can understand the heap debugging of every layer, or for every layer's heap debugging tools to implement the pagemap walking logic, in which case it is hard to get a consistent view of memory across the whole system. Tracking the information in userspace leads to all sorts of problems. It either needs to be stored inside the process, which means every process has to have an API to export its current heap information upon request, or it has to be stored externally in a filesystem that somebody needs to clean up on crashes. It needs to be readable while the process is still running, so it has to have some sort of synchronization with every layer of userspace. Efficiently tracking the ranges requires reimplementing something like the kernel vma trees, and linking to it from every layer of userspace. It requires more memory, more syscalls, more runtime cost, and more complexity to separately track regions that the kernel is already tracking. This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a userspace-provided name for anonymous vmas. The names of named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:<name>]. Userspace can set the name for a region of memory by calling prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name). Setting the name to NULL clears it. The name length limit is 80 bytes including the NUL terminator, and the name is checked to contain only printable ASCII characters (including space), except '[',']','\','$' and '`'. ASCII strings are used to provide descriptive identifiers for vmas, which can be understood by users reading /proc/pid/maps or /proc/pid/smaps. Names can be standardized for a given system and they can include some variable parts such as the name of the allocator or a library, the tid of the thread using it, etc. The name is stored in a pointer in the shared union in vm_area_struct that points to a null-terminated string. Anonymous vmas with the same name (equivalent strings) that are otherwise mergeable will be merged. The name pointers are not shared between vmas even if they contain the same name. The name pointer is stored in a union with fields that are only used on file-backed mappings, so it does not increase memory usage. The CONFIG_ANON_VMA_NAME kernel configuration option is introduced to enable this feature. It keeps the feature disabled by default to prevent any additional memory overhead and to avoid confusing procfs parsers on systems which are not ready to support named anonymous vmas. The patch is based on the original patch developed by Colin Cross, more specifically on its latest version [1] posted upstream by Sumit Semwal. It used a userspace pointer to store vma names. In that design, name pointers could be shared between vmas. However, during the last upstreaming attempt, Kees Cook raised concerns [2] about this approach and suggested copying the name into kernel memory space, performing validity checks [3], and storing it as a string referenced from vm_area_struct. One big concern is fork() performance, which would need to strdup anonymous vma names. Dave Hansen suggested experimenting with the worst-case scenario of forking a process with 64k vmas having the longest possible names [4]. I ran this experiment on an ARM64 Android device and recorded a worst-case regression of almost 40% when forking such a process. 
This regression is addressed in the follow-up patch, which replaces the pointer to a name with a refcounted structure that allows sharing the name pointer between vmas of the same name. Instead of duplicating the string during fork() or when splitting a vma, it increments the refcount. [1] https://lore.kernel.org/linux-mm/20200901161459.11772-4-sumit.semwal@linaro.org/ [2] https://lore.kernel.org/linux-mm/202009031031.D32EF57ED@keescook/ [3] https://lore.kernel.org/linux-mm/202009031022.3834F692@keescook/ [4] https://lore.kernel.org/linux-mm/5d0358ab-8c47-2f5f-8e43-23b89d6a8e95@intel.com/ Changes for the prctl(2) manual page (in the options section): PR_SET_VMA Sets an attribute specified in arg2 for virtual memory areas starting from the address specified in arg3 and spanning the size specified in arg4. arg5 specifies the value of the attribute to be set. Note that assigning an attribute to a virtual memory area might prevent it from being merged with adjacent virtual memory areas due to the difference in that attribute's value. Currently, arg2 must be one of: PR_SET_VMA_ANON_NAME Set a name for anonymous virtual memory areas. arg5 should be a pointer to a null-terminated string containing the name. The name length including the null byte cannot exceed 80 bytes. If arg5 is NULL, the name of the appropriate anonymous virtual memory areas will be reset. The name can contain only printable ASCII characters (including space), except '[',']','\','$' and '`'. This feature is available only if the kernel is built with the CONFIG_ANON_VMA_NAME option enabled. [surenb@google.com: docs: proc.rst: /proc/PID/maps: fix malformed table] Link: https://lkml.kernel.org/r/20211123185928.2513763-1-surenb@google.com [surenb: rebased over v5.15-rc6, replaced userpointer with a kernel copy, added input sanitization and CONFIG_ANON_VMA_NAME config. The bulk of the work here was done by Colin Cross, therefore, with his permission, keeping him as the author] Link: https://lkml.kernel.org/r/20211019215511.3771969-2-surenb@google.com Signed-off-by: Colin Cross Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Stephen Rothwell Cc: Al Viro Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. 
Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++- include/linux/mm_types.h | 64 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 72 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..7000442984b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2658,7 +2658,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3391,5 +3391,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3a6e6209600..799e2ee626b2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -426,11 +426,19 @@ struct vm_area_struct { /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + char *anon_name; + }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -875,4 +883,52 @@ typedef struct { unsigned long val; } swp_entry_t; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. 
+ */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From 78db3412833dc9c479cd17412035f216cfd01a29 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:03 -0800 Subject: mm: add anonymous vma name refcounting While forking a process with a high number (64K) of named anonymous vmas, the overhead caused by strdup() is noticeable. Experiments with an ARM64 Android device show up to a 40% performance regression when forking a process with 64k unpopulated anonymous vmas using the maximum name lengths, vs the same process with the same number of anonymous vmas having no name. Introduce a refcounted anon_vma_name structure to avoid the overhead of copying vma names during fork() and when splitting named anonymous vmas. When a vma is duplicated, instead of copying the name we increment the refcount of this structure. Multiple vmas can point to the same anon_vma_name as long as they increment the refcount. The name member of the anon_vma_name structure is assigned at structure allocation time and is never changed. If a vma's name changes, then the refcount of the original structure is dropped, a new anon_vma_name structure is allocated to hold the new name, and the vma pointer is updated to point to the new structure. With this approach the fork() performance regression is reduced 3-4x, and with use cases using a more reasonable number of VMAs (a few thousand) the regression is not measurable. Link: https://lkml.kernel.org/r/20211019215511.3771969-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Al Viro Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 799e2ee626b2..449b6eafc695 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -5,6 +5,7 @@ #include <linux/mm_types_task.h> #include <linux/auxvec.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> @@ -386,6 +387,12 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + /* * This struct describes a virtual memory area. There is one of these per VM-area/task. 
A VM area is any part of the process virtual memory @@ -437,7 +444,7 @@ struct vm_area_struct { unsigned long rb_subtree_last; } shared; /* Serialized by mmap_sem. */ - char *anon_name; + struct anon_vma_name *anon_name; }; /* -- cgit v1.2.3 From 17fca131cee21724ee953a17c185c14e9533af5b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:07 -0800 Subject: mm: move anon_vma declarations to linux/mm_inline.h The patch to add anonymous vma names causes a build failure in some configurations: include/linux/mm_types.h: In function 'is_same_vma_anon_name': include/linux/mm_types.h:924:37: error: implicit declaration of function 'strcmp' [-Werror=implicit-function-declaration] 924 | return name && vma_name && !strcmp(name, vma_name); | ^~~~~~ include/linux/mm_types.h:22:1: note: 'strcmp' is defined in header '<string.h>'; did you forget to '#include <string.h>'? This should not really be part of linux/mm_types.h in the first place, as that header is meant to only contain structure definitions and needs a minimum set of indirect includes itself. While the header clearly includes more than it should at this point, let's not make it worse by including string.h as well, which would pull in the expensive (compile-speed wise) fortify-string logic. Move the new functions into a separate header that only needs to be included in a couple of locations. Link: https://lkml.kernel.org/r/20211207125710.2503446-1-arnd@kernel.org Fixes: "mm: add a field to store names for private anonymous memory" Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Colin Cross Cc: Eric Biederman Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 48 --------------------------------------- 2 files changed, 50 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e2ec68b0515c..47d96d2647ca 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include <linux/huge_mm.h> #include <linux/swap.h> +#include <linux/string.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -135,4 +136,53 @@ static __always_inline void del_page_from_lru_list(struct page *page, { lruvec_del_folio(lruvec, page_folio(page)); } + +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. 
+ */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 449b6eafc695..4d5fb84eed5e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -890,52 +890,4 @@ typedef struct { unsigned long val; } swp_entry_t; -#ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. - */ -extern const char *vma_anon_name(struct vm_area_struct *vma); - -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); - -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); - -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - const char *vma_name = vma_anon_name(vma); - - /* either both NULL, or pointers to same string */ - if (vma_name == name) - return true; - - return name && vma_name && !strcmp(name, vma_name); -} -#else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) -{ - return NULL; -} -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - return true; -} -#endif /* CONFIG_ANON_VMA_NAME */ - #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From 36090def7bad06a6346f86a7cfdbfda2d138cb64 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:10 -0800 Subject: mm: move tlb_flush_pending inline helpers to mm_inline.h linux/mm_types.h should only define structure definitions, to make it cheap to include elsewhere. The atomic_t helper function definitions are particularly large, so it's better to move the helpers using those into the existing linux/mm_inline.h and only include that where needed. As a follow-up, we may want to go through all the indirect includes in mm_types.h and reduce them as much as possible. 
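To show the pattern those helpers serve, here is a hedged sketch modeled on mprotect-style callers; the function below is illustrative, not a hunk from this series:

/*
 * Sketch only. The inc/dec pair brackets the PTE changes and the TLB
 * flush, so that mm_tlb_flush_pending() readers under the PTL can see
 * the pending flush (see the ordering comment in the diff below).
 */
static void change_range(struct mm_struct *mm, struct vm_area_struct *vma,
			 unsigned long start, unsigned long end)
{
	inc_tlb_flush_pending(mm);	/* before taking the PTL */
	/* ... modify PTEs under the page table lock ... */
	flush_tlb_range(vma, start, end);
	dec_tlb_flush_pending(mm);	/* only after the flush is done */
}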
Link: https://lkml.kernel.org/r/20211207125710.2503446-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Colin Cross Cc: Kees Cook Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Yu Zhao Cc: Vlastimil Babka Cc: Matthew Wilcox (Oracle) Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 45 ---------------- include/linux/mm_inline.h | 86 +++++++++++++++++++++++++++++++ include/linux/mm_types.h | 129 ++++++++++++++++------------------------------ 3 files changed, 131 insertions(+), 129 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7000442984b9..c17e5cfc1e47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -424,51 +424,6 @@ extern unsigned int kobjsize(const void *objp); */ extern pgprot_t protection_map[16]; -/** - * enum fault_flag - Fault flag definitions. - * @FAULT_FLAG_WRITE: Fault was a write fault. - * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. - * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. - * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. - * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. - * @FAULT_FLAG_TRIED: The fault has been tried once. - * @FAULT_FLAG_USER: The fault originated in userspace. - * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. - * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. - * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * - * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify - * whether we would allow page faults to retry by specifying these two - * fault flags correctly. Currently there can be three legal combinations: - * - * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and - * this is the first try - * - * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and - * we've already tried at least once - * - * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry - * - * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never - * be used. Note that page faults can be allowed to retry for multiple times, - * in which case we'll have an initial fault with flags (a) then later on - * continuous faults with flags (b). We should always try to detect pending - * signals before a retry to make sure the continuous page faults can still be - * interrupted if necessary. - */ -enum fault_flag { - FAULT_FLAG_WRITE = 1 << 0, - FAULT_FLAG_MKWRITE = 1 << 1, - FAULT_FLAG_ALLOW_RETRY = 1 << 2, - FAULT_FLAG_RETRY_NOWAIT = 1 << 3, - FAULT_FLAG_KILLABLE = 1 << 4, - FAULT_FLAG_TRIED = 1 << 5, - FAULT_FLAG_USER = 1 << 6, - FAULT_FLAG_REMOTE = 1 << 7, - FAULT_FLAG_INSTRUCTION = 1 << 8, - FAULT_FLAG_INTERRUPTIBLE = 1 << 9, -}; - /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 47d96d2647ca..b725839dfe71 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,6 +2,7 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/swap.h> #include <linux/string.h> @@ -185,4 +186,89 @@ static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline void init_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); + /* + * The only time this value is relevant is when there are indeed pages + * to flush. And we'll only flush pages after changing them, which + * requires the PTL. + * + * So the ordering here is: + * + * atomic_inc(&mm->tlb_flush_pending); + * spin_lock(&ptl); + * ... + * set_pte_at(); + * spin_unlock(&ptl); + * + * spin_lock(&ptl) + * mm_tlb_flush_pending(); + * .... + * spin_unlock(&ptl); + * + * flush_tlb_range(); + * atomic_dec(&mm->tlb_flush_pending); + * + * Where the increment if constrained by the PTL unlock, it thus + * ensures that the increment is visible if the PTE modification is + * visible. After all, if there is no PTE modification, nobody cares + * about TLB flushes either. + * + * This very much relies on users (mm_tlb_flush_pending() and + * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and + * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc + * locks (PPC) the unlock of one doesn't order against the lock of + * another PTL. + * + * The decrement is ordered by the flush_tlb_range(), such that + * mm_tlb_flush_pending() will not return false unless all flushes have + * completed. + */ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * See inc_tlb_flush_pending(). + * + * This cannot be smp_mb__before_atomic() because smp_mb() simply does + * not order against TLB invalidate completion, which is what we need. + * + * Therefore we must rely on tlb_flush_*() to guarantee order. + */ + atomic_dec(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * Must be called after having acquired the PTL; orders against that + * PTLs release and therefore ensures that if we observe the modified + * PTE we must also observe the increment from inc_tlb_flush_pending(). + * + * That is, it only guarantees to return true if there is a flush + * pending for _this_ PTL. + */ + return atomic_read(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + /* + * Similar to mm_tlb_flush_pending(), we must have acquired the PTL + * for which there is a TLB flush pending in order to guarantee + * we've seen both that PTE modification and the increment. 
+ * + * (no requirement on actually still holding the PTL, that is irrelevant) + */ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4d5fb84eed5e..6a89f128c990 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -692,90 +692,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb); -static inline void init_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_set(&mm->tlb_flush_pending, 0); -} - -static inline void inc_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_inc(&mm->tlb_flush_pending); - /* - * The only time this value is relevant is when there are indeed pages - * to flush. And we'll only flush pages after changing them, which - * requires the PTL. - * - * So the ordering here is: - * - * atomic_inc(&mm->tlb_flush_pending); - * spin_lock(&ptl); - * ... - * set_pte_at(); - * spin_unlock(&ptl); - * - * spin_lock(&ptl) - * mm_tlb_flush_pending(); - * .... - * spin_unlock(&ptl); - * - * flush_tlb_range(); - * atomic_dec(&mm->tlb_flush_pending); - * - * Where the increment if constrained by the PTL unlock, it thus - * ensures that the increment is visible if the PTE modification is - * visible. After all, if there is no PTE modification, nobody cares - * about TLB flushes either. - * - * This very much relies on users (mm_tlb_flush_pending() and - * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and - * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc - * locks (PPC) the unlock of one doesn't order against the lock of - * another PTL. - * - * The decrement is ordered by the flush_tlb_range(), such that - * mm_tlb_flush_pending() will not return false unless all flushes have - * completed. - */ -} - -static inline void dec_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * See inc_tlb_flush_pending(). - * - * This cannot be smp_mb__before_atomic() because smp_mb() simply does - * not order against TLB invalidate completion, which is what we need. - * - * Therefore we must rely on tlb_flush_*() to guarantee order. - */ - atomic_dec(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * Must be called after having acquired the PTL; orders against that - * PTLs release and therefore ensures that if we observe the modified - * PTE we must also observe the increment from inc_tlb_flush_pending(). - * - * That is, it only guarantees to return true if there is a flush - * pending for _this_ PTL. - */ - return atomic_read(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) -{ - /* - * Similar to mm_tlb_flush_pending(), we must have acquired the PTL - * for which there is a TLB flush pending in order to guarantee - * we've seen both that PTE modification and the increment. - * - * (no requirement on actually still holding the PTL, that is irrelevant) - */ - return atomic_read(&mm->tlb_flush_pending) > 1; -} - struct vm_fault; /** @@ -890,4 +806,49 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * enum fault_flag - Fault flag definitions. + * @FAULT_FLAG_WRITE: Fault was a write fault. + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. + * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. 
+ * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. + * @FAULT_FLAG_TRIED: The fault has been tried once. + * @FAULT_FLAG_USER: The fault originated in userspace. + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. + */ +enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, +}; + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From cc6dcfee72509868271d42919a3c1081b6b0dc7e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:18 -0800 Subject: mm: document locking restrictions for vm_operations_struct::close Add comments for vm_operations_struct::close documenting locking requirements for this callback and its callers. Link: https://lkml.kernel.org/r/20211209191325.3069345-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c17e5cfc1e47..4d7245e6802a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,6 +532,10 @@ enum page_entry_size { */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. 
+ */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); -- cgit v1.2.3 From 08d5b29eac7dd5e6c79b66d390ecbb9219e05931 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:33 -0800 Subject: mm: ptep_clear() page table helper We have the ptep_get_and_clear() and ptep_get_and_clear_full() helpers to clear a PTE from user page tables, but there is no variant for a simple clear of a present PTE from user page tables without using the low-level pte_clear(), which can be either native or para-virtualised. Add a new ptep_clear() that can be used in common code to clear PTEs from page tables. We will need this call later in order to add a hook for page table check. Link: https://lkml.kernel.org/r/20211221154650.1047963-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pgtable.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e24d2c992b11..bc8713a76e03 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -258,6 +258,14 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_clear(mm, addr, ptep); +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, -- cgit v1.2.3 From df4e817b710809425d899340dbfa8504a3ca4ba5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:37 -0800 Subject: mm: page table check Check user page table entries at the time they are added and removed. This allows memory corruption issues related to double mapping to be caught synchronously. When a pte for an anonymous page is added into a page table, we verify that this pte does not already point to a file-backed page, and vice versa: if a file-backed page is being added, we verify that this page does not have an anonymous mapping. We also enforce that read-only sharing for anonymous pages is allowed (i.e., COW after fork). All other sharing must be for file pages. Page table check allows us to protect against and debug cases where "struct page" metadata became corrupted for some reason, for example when the refcnt or mapcount become invalid. Link: https://lkml.kernel.org/r/20211221154650.1047963-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_table_check.h | 147 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 include/linux/page_table_check.h (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h new file mode 100644 index 000000000000..38cace1da7b6 --- /dev/null +++ b/include/linux/page_table_check.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2021, Google LLC. + * Pasha Tatashin + */ +#ifndef __LINUX_PAGE_TABLE_CHECK_H +#define __LINUX_PAGE_TABLE_CHECK_H + +#ifdef CONFIG_PAGE_TABLE_CHECK +#include <linux/page_ext.h> + +extern struct static_key_true page_table_check_disabled; +extern struct page_ext_operations page_table_check_ops; + +void __page_table_check_zero(struct page *page, unsigned int order); +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte); +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear(mm, addr, pte); +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_clear(mm, addr, pmd); +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_clear(mm, addr, pud); +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_set(mm, addr, ptep, pte); +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_set(mm, addr, pmdp, pmd); +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_set(mm, addr, pudp, pud); +} + +#else + +static inline void 
page_table_check_alloc(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ +} + +#endif /* CONFIG_PAGE_TABLE_CHECK */ +#endif /* __LINUX_PAGE_TABLE_CHECK_H */ -- cgit v1.2.3 From 020e87650af9f43683546729f959fdc78422a4b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:44 -0800 Subject: mm: remove last argument of reuse_swap_page() None of the callers care about the total_map_swapcount() any more. Link: https://lkml.kernel.org/r/20211220205943.456187-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: William Kucharski Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d1ea44b31f19..bdccbf1efa61 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,7 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *, int *); +extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -680,8 +680,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_map_swapcount) \ - (page_trans_huge_mapcount(page, total_map_swapcount) == 1) +#define reuse_swap_page(page) \ + (page_trans_huge_mapcount(page, NULL) == 1) static inline int try_to_free_swap(struct page *page) { -- cgit v1.2.3 From d08d2b62510e2407cf939e693aefd179dc114913 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:51 -0800 Subject: mm: remove the total_mapcount argument from page_trans_huge_mapcount() All callers pass NULL, so we can stop calculating the value we would store in it. 
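As a hedged illustration of the simplified question the helper now answers (the wrapper name below is hypothetical): with the out-parameter gone, reuse_swap_page() reduces to asking whether a possibly-huge anonymous page is mapped exactly once, which is all the write-fault reuse decision needs.

/* Sketch only: the exclusivity test behind reuse_swap_page(). */
static inline bool page_exclusively_mapped(struct page *page)
{
	return page_trans_huge_mapcount(page) == 1;
}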
Link: https://lkml.kernel.org/r/20211220205943.456187-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Linus Torvalds Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++------- include/linux/swap.h | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d7245e6802a..cef65f9cbdf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -799,19 +799,15 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page, int *total_mapcount); +int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) +static inline int page_trans_huge_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index bdccbf1efa61..1d38d9475c4d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -681,7 +681,7 @@ static inline int swp_swapcount(swp_entry_t entry) } #define reuse_swap_page(page) \ - (page_trans_huge_mapcount(page, NULL) == 1) + (page_trans_huge_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) { -- cgit v1.2.3 From a421ef303008b0ceee2cfc625c3246fa7654b0ca Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:07 -0800 Subject: mm: allow !GFP_KERNEL allocations for kvmalloc Support for GFP_NO{FS,IO} and __GFP_NOFAIL has been implemented by previous patches, so we can now enable that support for kvmalloc. This will allow some external users to simplify or completely remove their helpers. The GFP_NOWAIT semantic has never been supported; since that was never explicitly documented either, add a note about it. ceph_kvmalloc is the first helper to be dropped and changed to kvmalloc. Link: https://lkml.kernel.org/r/20211122153233.9924-5-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ceph/libceph.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..309acbcb5a8a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern void *ceph_kvmalloc(size_t size, gfp_t flags); struct fs_parameter; struct fc_log; -- cgit v1.2.3 From 4034247a0d6ab281ba3293798ce67af494d86129 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2022 14:07:14 -0800 Subject: mm: introduce memalloc_retry_wait() Various places in the kernel - largely in filesystems - respond to a memory allocation failure by looping around and re-trying. 
Some of these cannot conveniently use __GFP_NOFAIL, for reasons such as: - a GFP_ATOMIC allocation, which __GFP_NOFAIL doesn't work on - a need to check for the process being signalled between failures - the possibility that other recovery actions could be performed - the allocation is quite deep in support code, and passing down an extra flag to say if __GFP_NOFAIL is wanted would be clumsy. Many of these currently use congestion_wait() which (in almost all cases) simply waits the given timeout - congestion isn't tracked for most devices. It isn't clear what the best delay is for loops, but it is clear that the various filesystems shouldn't be responsible for choosing a timeout. This patch introduces memalloc_retry_wait() which takes on that responsibility. Code that wants to retry a memory allocation can call this function passing the GFP flags that were used. It will wait as is appropriate. For now, it only considers __GFP_NORETRY and whatever gfpflags_allow_blocking() tests. If blocking is allowed without __GFP_NORETRY, then alloc_page either made some reclaim progress, or waited for a while, before failing. So there is no need for much further waiting. memalloc_retry_wait() will wait until the current jiffie ends. If this condition is not met, then alloc_page() won't have waited much if at all. In that case memalloc_retry_wait() waits about 200ms. This is the delay that most current loops use. linux/sched/mm.h needs to be included in some files now, but linux/backing-dev.h does not. Link: https://lkml.kernel.org/r/163754371968.13692.1277530886009912421@noble.neil.brown.name Signed-off-by: NeilBrown Cc: Dave Chinner Cc: Michal Hocko Cc: "Theodore Ts'o" Cc: Jaegeuk Kim Cc: Chao Yu Cc: Darrick J. Wong Cc: Chuck Lever Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index aca874d33fe6..aa5f09ca5bcf 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -214,6 +214,32 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif +/* Any memory-allocation retry loop should use + * memalloc_retry_wait(), and pass the flags for the most + * constrained allocation attempt that might have failed. + * This provides useful documentation of where loops are, + * and a central place to fine tune the waiting as the MM + * implementation changes. + */ +static inline void memalloc_retry_wait(gfp_t gfp_flags) +{ + /* We use io_schedule_timeout because waiting for memory + * typically included waiting for dirty pages to be + * written out, which requires IO. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + gfp_flags = current_gfp_context(gfp_flags); + if (gfpflags_allow_blocking(gfp_flags) && + !(gfp_flags & __GFP_NORETRY)) + /* Probably waited already, no need for much more */ + io_schedule_timeout(1); + else + /* Probably didn't wait, and has now released a lock, + * so now is a good time to wait + */ + io_schedule_timeout(HZ/50); +} + /** * might_alloc - Mark possible allocation sites * @gfp_mask: gfp_t flags that would be used to allocate -- cgit v1.2.3 From 1611f74a94ba2e0f2d25b75008ed8e76e122097a Mon Sep 17 00:00:00 2001 From: Changcheng Deng Date: Fri, 14 Jan 2022 14:07:21 -0800 Subject: mm: fix boolreturn.cocci warning Return statements in functions returning bool should use true/false instead of 1/0. 
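In miniature, the warned-about pattern and its fix (using a hypothetical flag name; the real change is in the TESTPAGEFLAG_FALSE macro in the hunk below):

	/* Before: a bool function returning the integer literal 0. */
	static inline bool folio_test_example(const struct folio *folio) { return 0; }

	/* After: the boolean constant matches the declared return type. */
	static inline bool folio_test_example(const struct folio *folio) { return false; }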
Link: https://lkml.kernel.org/r/20211126073327.74815-1-deng.changcheng@zte.com.cn Signed-off-by: Changcheng Deng Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b5f14d581113..18423c2157e8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -383,7 +383,7 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname, lname) \ -static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ +static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ -- cgit v1.2.3 From be1a13eb51077b2ec5f7f4306f93dfece503a3f1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:27 -0800 Subject: mm: drop node from alloc_pages_vma alloc_pages_vma is meant to allocate a page with a vma specific memory policy. The initial node parameter is always a local node so it is pointless to waste a function argument for this. Drop the parameter. Link: https://lkml.kernel.org/r/YaSnlv4QpryEpesG@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8fcc38467af6..78b58448f796 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -598,9 +598,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order); struct folio *folio_alloc(gfp_t gfp, unsigned order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); + bool hugepage); #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + alloc_pages_vma(gfp_mask, order, vma, addr, true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -610,14 +610,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ +#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -- cgit v1.2.3 From 04a536bfbd0f885338eecc2a4503dfca50ac94dd Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Fri, 14 Jan 2022 14:07:30 -0800 Subject: include/linux/gfp.h: further document GFP_DMA32 kmalloc(..., GFP_DMA32) does not return DMA32 memory because the DMA32 kmalloc cache array is not implemented. 
(Reason: there is no such user in kernel). Put a short comment about this so people can understand the situation by reading the comment. [1] https://lists.linuxfoundation.org/pipermail/iommu/2018-December/031696.html Link: https://lkml.kernel.org/r/20211207093610.6406-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 78b58448f796..80f63c862be5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -302,7 +302,9 @@ struct vm_area_struct; * lowest zone as a type of emergency reserve. * * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit - * address. + * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory + * because the DMA32 kmalloc cache array is not implemented. + * (Reason: there is no such user in kernel). * * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, * do not need to be directly accessible by the kernel but that cannot -- cgit v1.2.3 From 62b3107073646e0946bd97ff926832bafb846d17 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:37 -0800 Subject: mm_zone: add function to check if managed dma zone exists Patch series "Handle warning of allocation failure on DMA zone w/o managed pages", v4. **Problem observed: On x86_64, when a crash is triggered and the system enters the kdump kernel, a page allocation failure can always be seen. --------------------------------- DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 1 Comm: swapper/0 Call Trace: dump_stack+0x7f/0xa1 warn_alloc.cold+0x72/0xd6 ...... __alloc_pages+0x24d/0x2c0 ...... dma_atomic_pool_init+0xdb/0x176 do_one_initcall+0x67/0x320 ? rcu_read_lock_sched_held+0x3f/0x80 kernel_init_freeable+0x290/0x2dc ? rest_init+0x24f/0x24f kernel_init+0xa/0x111 ret_from_fork+0x22/0x30 Mem-Info: ------------------------------------ ***Root cause: The current kernel assumes that the DMA zone must have managed pages and tries to request pages from it if CONFIG_ZONE_DMA is enabled, but this is not always true. E.g. in the kdump kernel on x86_64, only the low 1M is present and locked down at a very early stage of boot, so the low 1M is never added to the buddy allocator and never becomes managed pages of the DMA zone. This exception will always cause a page allocation failure when a page is requested from the DMA zone. ***Investigation: This failure has happened since the commits below were merged into Linus's tree. 1a6a9044b967 x86/setup: Remove CONFIG_X86_RESERVE_LOW and reservelow= options 23721c8e92f7 x86/crash: Remove crash_reserve_low_1M() f1d4d47c5851 x86/setup: Always reserve the first 1M of RAM 7c321eb2b843 x86/kdump: Remove the backup region handling 6f599d84231f x86/kdump: Always reserve the low 1M when the crashkernel option is specified Before them, on x86_64, the low 640K area was reused by the kdump kernel: the content of the low 640K area was copied into a backup region for dumping before jumping into kdump. Then, except for the firmware-reserved regions in [0, 640K], the remaining area was added to the buddy allocator and became available managed pages of the DMA zone. However, after the above commits, in the kdump kernel on x86_64 the low 1M is reserved by memblock but never released to the buddy allocator. 
So any later page allocation requested from the DMA zone will fail. Initially, if crashkernel was reserved, the low 1M needed to be locked down because AMD SME encrypts memory, making the old backup region mechanism impossible when switching into the kdump kernel. Later, it was also observed that there are BIOSes corrupting memory under 1M. To solve this, in commit f1d4d47c5851, the entire low 1M region is always reserved after the real mode trampoline is allocated. Besides, an Intel engineer recently mentioned that their TDX (Trusted Domain Extensions) work, which is under development in the kernel, also needs to lock down the low 1M. So we can't simply revert the above commits to fix the page allocation failure from the DMA zone, as some have suggested. ***Solution: Currently, only the DMA atomic pool and dma-kmalloc initialize and request page allocation with GFP_DMA during bootup. So initialize the DMA atomic pool only when the DMA zone has available managed pages, and otherwise just skip the initialization. For dma-kmalloc(), for the time being, mute the allocation failure warning when pages are requested from a DMA zone that has no managed pages. Meanwhile, change code to use the dma_alloc_xx/dma_map_xx APIs to replace kmalloc(GFP_DMA), or do not use GFP_DMA when calling kmalloc() if it is not necessary. Christoph is posting patches to fix those under drivers/scsi/. Finally, we can remove the need for dma-kmalloc() as people suggested. This patch (of 3): In some places, the current kernel assumes that the DMA zone must have managed pages if CONFIG_ZONE_DMA is enabled, but this is not always true. E.g. in the kdump kernel on x86_64, only the low 1M is present and locked down at a very early stage of boot, so there are no managed pages at all in the DMA zone. This exception will always cause a page allocation failure when a page is requested from the DMA zone. Here, add the function has_managed_dma() and the relevant helper functions to check whether there is a DMA zone with managed pages. It will be used in later patches. Link: https://lkml.kernel.org/r/20211223094435.248523-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20211223094435.248523-2-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Reviewed-by: David Hildenbrand Acked-by: John Donnelly Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: David Laight Cc: Borislav Petkov Cc: Marek Szyprowski Cc: Robin Murphy Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 936dc0b6c226..aed44e9b5d89 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1047,6 +1047,15 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); +#else +static inline bool has_managed_dma(void) +{ + return false; +} +#endif + /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references -- cgit v1.2.3 From f47761999052b1cc987dd3e3d3adf47997358fc0 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 14 Jan 2022 14:07:48 -0800 Subject: hugetlb: add hugetlb.*.numa_stat file For hugetlb-backed jobs/VMs it's critical to understand the NUMA information for the memory backing these jobs in order to deliver optimal performance. 
Currently this can technically be queried from /proc/self/numa_maps, but there are significant issues with that. Namely: 1. Memory can be mapped or unmapped. 2. numa_maps are per process and need to be aggregated across all processes in the cgroup. For shared memory this is more involved, as userspace needs to make sure it doesn't double-count shared mappings. 3. I believe querying numa_maps needs to hold the mmap_lock, which adds to the contention on this lock. For these reasons I propose simply adding a hugetlb.*.numa_stat file, which shows the NUMA information of the cgroup similarly to memory.numa_stat. On cgroup-v2: cat /sys/fs/cgroup/unified/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 On cgroup-v1: cat /sys/fs/cgroup/hugetlb/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 hierarchical_total=2097152 N0=2097152 N1=0 This patch was tested manually by allocating hugetlb memory and querying the hugetlb.*.numa_stat file of the cgroup and its parents. [colin.i.king@googlemail.com: fix spelling mistake "hierarichal" -> "hierarchical"] Link: https://lkml.kernel.org/r/20211125090635.23508-1-colin.i.king@gmail.com [keescook@chromium.org: fix copy/paste array assignment] Link: https://lkml.kernel.org/r/20211203065647.2819707-1-keescook@chromium.org Link: https://lkml.kernel.org/r/20211123001020.4083653-1-almasrymina@google.com Signed-off-by: Mina Almasry Signed-off-by: Colin Ian King Signed-off-by: Kees Cook Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Shuah Khan Cc: Miaohe Lin Cc: Oscar Salvador Cc: Michal Hocko Cc: David Rientjes Cc: Jue Wang Cc: Yang Yao Cc: Joanna Li Cc: Cannon Matthews Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++-- include/linux/hugetlb_cgroup.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 00351ccb49a3..d1897a69c540 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -622,8 +622,8 @@ struct hstate { #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ - struct cftype cgroup_files_dfl[7]; - struct cftype cgroup_files_legacy[9]; + struct cftype cgroup_files_dfl[8]; + struct cftype cgroup_files_legacy[10]; #endif char name[HSTATE_NAME_LEN]; }; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index ba025ae27882..379344828e78 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -36,6 +36,11 @@ enum hugetlb_memory_event { HUGETLB_NR_MEMORY_EVENTS, }; +struct hugetlb_cgroup_per_node { + /* hugetlb usage in pages over all hstates. */ + unsigned long usage[HUGE_MAX_HSTATE]; +}; + struct hugetlb_cgroup { struct cgroup_subsys_state css; @@ -57,6 +62,8 @@ struct hugetlb_cgroup { /* Handle for "hugetlb.events.local" */ struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; + + struct hugetlb_cgroup_per_node *nodeinfo[]; }; static inline struct hugetlb_cgroup * -- cgit v1.2.3 From e9ea874a8ffb0f8ebed4f4981531a32c5b663d79 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 14 Jan 2022 14:07:55 -0800 Subject: mm/vmstat: add events for THP max_ptes_* exceeds There are interfaces to adjust the max_ptes_none, max_ptes_swap and max_ptes_shared values, see /sys/kernel/mm/transparent_hugepage/khugepaged/, but the system administrator may not know which value is best. So add those events to support adjusting max_ptes_* to suitable values. 
For example, if the default max_ptes_swap value causes too many failures and the system uses zram, whose I/O is fast, the administrator could increase max_ptes_swap until THP_SCAN_EXCEED_SWAP_PTE stops increasing. Link: https://lkml.kernel.org/r/20211225094036.574157-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Cc: "Huang, Ying" Cc: Dave Hansen Cc: Minchan Kim Cc: Saravanan D Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a185cc75ff52..7b2363388bfa 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -98,6 +98,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, + THP_SCAN_EXCEED_NONE_PTE, + THP_SCAN_EXCEED_SWAP_PTE, + THP_SCAN_EXCEED_SHARED_PTE, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, #endif -- cgit v1.2.3 From e4b424b7ec8791087375bb1f2480a3ba05d21e0b Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 14 Jan 2022 14:08:07 -0800 Subject: vmscan: make drop_slab_node static drop_slab_node is only used in drop_slab, so remove its declaration from the header file and make its definition static. Link: https://lkml.kernel.org/r/20211111062445.5236-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cef65f9cbdf2..eb67eb699b78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3122,7 +3122,6 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, #endif void drop_slab(void); -void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 -- cgit v1.2.3 From c6018b4b254971863bd0ad36bb5e7d0fa0f0ddb0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:17 -0800 Subject: mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policies. Users should use this syscall after setting up a memory policy for the specified range, as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which the kernel will fulfill memory allocation requests first. For an address range with the MPOL_BIND memory policy, if the nodemask specifies more than one node, page allocations will come from the node in the nodemask that has sufficient free memory and is closest to the home node/preferred node. For MPOL_PREFERRED_MANY, if the nodemask specifies more than one node, page allocations will likewise come from the node in the nodemask that has sufficient free memory and is closest to the home node/preferred node. If none of the nodes specified in the nodemask has enough memory, the allocation will be attempted from the NUMA node closest to the home node in the system. This helps applications to hint at a memory allocation preference node and fall back to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. 
This gives applications control over the NUMA nodes used for memory allocation and avoids the default fallback to slow-memory NUMA nodes. For example, on a system where NUMA nodes 1, 2 and 3 have DRAM memory and nodes 10, 11 and 12 have slow memory: new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel only allocates from nodes 1, 2, and 3. Memory will not be allocated from slow-memory nodes 10, 11, and 12. This differs from the default MPOL_BIND behavior, where the allocation is attempted from the node closest to the local node. One of the reasons to specify a home node is to allow allocations from a CPU-less NUMA node and its nearby NUMA nodes. MPOL_PREFERRED_MANY, on the other hand, will first try to allocate from the node closest to node 2 in the node list 1, 2 and 3. If those nodes don't have enough memory, the kernel will allocate from slow-memory nodes 10, 11 and 12, whichever is closest to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3c7595e81150..668389b4b53d 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -46,6 +46,7 @@ struct mempolicy { unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ -- cgit v1.2.3 From 21b084fdf2a49ca1634e8e360e9ab6f9ff0dee11 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:21 -0800 Subject: mm/mempolicy: wire up syscall set_mempolicy_home_node Link: https://lkml.kernel.org/r/20211202123810.267175-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 528a478dbda8..819c0cb00b6d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1057,6 +1057,9 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); asmlinkage long sys_memfd_secret(unsigned int flags); +asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); /* * 
Architecture-specific system calls -- cgit v1.2.3 From c9fdc4d5487a16bd1f003fc8b66e91f88efb50e6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:06 -0800 Subject: mm/hwpoison: remove MF_MSG_BUDDY_2ND and MF_MSG_POISONED_HUGE These action_page_types are no longer used, so remove them. Link: https://lkml.kernel.org/r/20211115084006.3728254-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Acked-by: Yang Shi Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: Ding Hui Cc: Miaohe Lin Cc: Michal Hocko Cc: Oscar Salvador Cc: Peter Xu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb67eb699b78..7f594da84aca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3201,7 +3201,6 @@ enum mf_action_page_type { MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_NON_PMD_HUGE, @@ -3216,7 +3215,6 @@ enum mf_action_page_type { MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, -- cgit v1.2.3 From bf181c582588f8f7406d52f2ee228539b465f173 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:09 -0800 Subject: mm/hwpoison: fix unpoison_memory() After the recent soft-offline rework, error pages can be taken off the buddy allocator, but the existing unpoison_memory() does not properly undo that operation. Moreover, due to the recent change to __get_hwpoison_page(), get_page_unless_zero() is hardly ever called for hwpoisoned pages. So __get_hwpoison_page() very likely returns -EBUSY (meaning it failed to grab the page refcount) and unpoison just clears PG_hwpoison without releasing a refcount. That does not lead to a critical issue like a kernel panic, but unpoisoned pages never get back to the buddy allocator (they are leaked permanently), which is not good. To (partially) fix this, we need to distinguish "taken off" pages from other types of hwpoisoned pages. We can't use the refcount or page flags for this purpose, so a pseudo flag is defined by hacking the ->private field. Someone might think that put_page() is enough to cancel taken-off pages, but the normal free path contains some operations not suitable for the current purpose, and can fire VM_BUG_ON(). Note that unpoison_memory() is now supposed to cancel hwpoison events injected only by madvise() or /sys/devices/system/memory/{hard,soft}_offline_page, not by MCE injection, so please don't try to use unpoison when testing with MCE injection. 
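A sketch of how such a ->private based pseudo flag can work; the helper names match the declarations added to page-flags.h below, but the bodies here are an illustration of the idea rather than the verbatim committed code:

	void SetPageHWPoisonTakenOff(struct page *page)
	{
		set_page_private(page, MAGIC_HWPOISON);
	}

	void ClearPageHWPoisonTakenOff(struct page *page)
	{
		if (PageHWPoison(page))
			set_page_private(page, 0);
	}

	/* A taken-off page is hwpoisoned and carries the magic in ->private. */
	static bool PageHWPoisonTakenOff(struct page *page)
	{
		return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
	}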
[lkp@intel.com: report build failure for ARCH=i386] Link: https://lkml.kernel.org/r/20211115084006.3728254-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Ding Hui Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Miaohe Lin Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/linux/page-flags.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7f594da84aca..d4fb49a5d60d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3174,6 +3174,7 @@ enum mf_flags { MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 18423c2157e8..7e2b90dc7d3f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -522,7 +522,11 @@ PAGEFLAG_FALSE(Uncached, uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) +#define MAGIC_HWPOISON 0x48575053U /* HWPS */ +extern void SetPageHWPoisonTakenOff(struct page *page); +extern void ClearPageHWPoisonTakenOff(struct page *page); extern bool take_page_off_buddy(struct page *page); +extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 -- cgit v1.2.3 From 5ee2fa2f063649570c702164f47a558a3432dd9e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 14 Jan 2022 14:09:16 -0800 Subject: mm/rmap: fix potential batched TLB flush race In theory, the following race is possible for batched TLB flushing. CPU0 CPU1 ---- ---- shrink_page_list() unmap zap_pte_range() flush_tlb_batched_pending() flush_tlb_mm() try_to_unmap() set_tlb_ubc_flush_pending() mm->tlb_flush_batched = true mm->tlb_flush_batched = false After the TLB is flushed on CPU1 via flush_tlb_mm() and before mm->tlb_flush_batched is set to false, some PTE is unmapped on CPU0 and its TLB flush is left pending. That pending TLB flush is then lost. Although both set_tlb_ubc_flush_pending() and flush_tlb_batched_pending() are called with the PTL locked, different PTL instances may be used. Because the race window is really small, and the lost TLB flush will cause problems only if a TLB entry is inserted before the unmapping in the race window, the race is only theoretical. But the fix is simple and cheap too. 
Syzbot has reported this too as follows: ================================================================== BUG: KCSAN: data-race in flush_tlb_batched_pending / try_to_unmap_one write to 0xffff8881072cfbbc of 1 bytes by task 17406 on cpu 1: flush_tlb_batched_pending+0x5f/0x80 mm/rmap.c:691 madvise_free_pte_range+0xee/0x7d0 mm/madvise.c:594 walk_pmd_range mm/pagewalk.c:128 [inline] walk_pud_range mm/pagewalk.c:205 [inline] walk_p4d_range mm/pagewalk.c:240 [inline] walk_pgd_range mm/pagewalk.c:277 [inline] __walk_page_range+0x981/0x1160 mm/pagewalk.c:379 walk_page_range+0x131/0x300 mm/pagewalk.c:475 madvise_free_single_vma mm/madvise.c:734 [inline] madvise_dontneed_free mm/madvise.c:822 [inline] madvise_vma mm/madvise.c:996 [inline] do_madvise+0xe4a/0x1140 mm/madvise.c:1202 __do_sys_madvise mm/madvise.c:1228 [inline] __se_sys_madvise mm/madvise.c:1226 [inline] __x64_sys_madvise+0x5d/0x70 mm/madvise.c:1226 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881072cfbbc of 1 bytes by task 71 on cpu 0: set_tlb_ubc_flush_pending mm/rmap.c:636 [inline] try_to_unmap_one+0x60e/0x1220 mm/rmap.c:1515 rmap_walk_anon+0x2fb/0x470 mm/rmap.c:2301 try_to_unmap+0xec/0x110 shrink_page_list+0xe91/0x2620 mm/vmscan.c:1719 shrink_inactive_list+0x3fb/0x730 mm/vmscan.c:2394 shrink_list mm/vmscan.c:2621 [inline] shrink_lruvec+0x3c9/0x710 mm/vmscan.c:2940 shrink_node_memcgs+0x23e/0x410 mm/vmscan.c:3129 shrink_node+0x8f6/0x1190 mm/vmscan.c:3252 kswapd_shrink_node mm/vmscan.c:4022 [inline] balance_pgdat+0x702/0xd30 mm/vmscan.c:4213 kswapd+0x200/0x340 mm/vmscan.c:4473 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x01 -> 0x00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 71 Comm: kswapd0 Not tainted 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 ================================================================== [akpm@linux-foundation.org: tweak comments] Link: https://lkml.kernel.org/r/20211201021104.126469-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reported-by: syzbot+aa5bebed695edaccf0df@syzkaller.appspotmail.com Cc: Nadav Amit Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Dave Hansen Cc: Will Deacon Cc: Yu Zhao Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a89f128c990..e3b0476a4fda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -647,7 +647,7 @@ struct mm_struct { atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ - bool tlb_flush_batched; + atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT -- cgit v1.2.3 From cab0a7c115546a4865fb7439558af9077a569574 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 14 Jan 2022 14:09:28 -0800 Subject: mm: make some vars and functions static or __init "page_idle_ops" is a global variable, but it is only used within its own file, so it should be static. "page_ext_ops" is a variable used only during the kernel's initialization phase, and the other functions are also used only during initialization, so they should be __init or __initdata so that their memory can be reclaimed. 
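The general pattern, sketched with hypothetical names (foo_ops and need_foo are illustrative, not from the patch):

	struct foo_operations {
		bool (*need)(void);
	};

	/* Runs only at boot; __init lets its code be reclaimed afterwards. */
	static bool __init need_foo(void)
	{
		return IS_ENABLED(CONFIG_64BIT);
	}

	/* Used only in this file and only during init: static + __initdata. */
	static struct foo_operations foo_ops __initdata = {
		.need = need_foo,
	};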
Link: https://lkml.kernel.org/r/20211217095023.67293-1-liuting.0x7c00@bytedance.com Signed-off-by: Ting Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_idle.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 83abf95e9fa7..4663dfed1293 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,7 +13,6 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ -extern struct page_ext_operations page_idle_ops; static inline bool folio_test_young(struct folio *folio) { -- cgit v1.2.3 From cdeed009f3bceee41f73f0137db785fd29a05cb8 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:44 -0800 Subject: mm/damon: remove some unneeded function definitions in damon.h In damon.h, some function declarations for VA & PA are only used in their own files, so there is no need to declare them in the header file, and the header file will look cleaner. If other files later need these functions, the prototypes can be added to damon.h at that time. [sj@kernel.org: remove unnecessary function prototype position changes] Link: https://lkml.kernel.org/r/20211118114827.20052-1-sj@kernel.org Link: https://lkml.kernel.org/r/45fd5b3ef6cce8e28dbc1c92f9dc845ccfc949d7.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d4be3cc987..1d1be348f506 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -461,34 +461,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ #ifdef CONFIG_DAMON_VADDR - -/* Monitoring primitives for virtual memory address spaces */ -void damon_va_init(struct damon_ctx *ctx); -void damon_va_update(struct damon_ctx *ctx); -void damon_va_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); -void damon_va_cleanup(struct damon_ctx *ctx); -int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_VADDR */ #ifdef CONFIG_DAMON_PADDR - -/* Monitoring primitives for the physical memory address space */ -void damon_pa_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); -int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_PADDR */ #endif /* _DAMON_H */ -- cgit v1.2.3 From 9b2a38d6ef25c1748e3964b0ff30a89e4ed26583 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:53 -0800 Subject: mm/damon: move damon_rand() definition into damon.h damon_rand() is called in three files: damon/core.c, damon/paddr.c and damon/vaddr.c; 
there is no need to define it in multiple places, so moving it to damon.h is a good choice. Link: https://lkml.kernel.org/r/20211202075859.51341-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1d1be348f506..3e91a597a1aa 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -11,12 +11,16 @@ #include #include #include +#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). -- cgit v1.2.3 From 234d68732b6c135087bdebfa0630a43ae8c27758 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:56 -0800 Subject: mm/damon: modify damon_rand() macro to static inline function damon_rand() should not be implemented as a macro. For example, with damon_rand(a++, b) the value of 'a' will be incremented twice. This is obviously wrong, so fix it by converting the macro to a static inline function. Link: https://lkml.kernel.org/r/110ffcd4e420c86c42b41ce2bc9f0fe6a4f32cd3.1638795127.git.xhao@linux.alibaba.com Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions") Signed-off-by: Xin Hao Reported-by: Andrew Morton Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3e91a597a1aa..e2c8152985b7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -19,7 +19,10 @@ #define DAMOS_MAX_SCORE (99) /* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) +static inline unsigned long damon_rand(unsigned long l, unsigned long r) +{ + return l + prandom_u32_max(r - l); +} /** * struct damon_addr_range - Represents an address region of [@start, @end). -- cgit v1.2.3 From 88f86dcfa454784f7de550966c60fc78a3e95d6d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:09:59 -0800 Subject: mm/damon: convert macro functions to static inline functions Patch series "mm/damon: Misc cleanups". This patchset contains miscellaneous cleanups for DAMON's macro functions and documentation. This patch (of 6): This commit converts macro functions in DAMON to static inline functions, for better type checking, code documentation, etc[1]. 
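To make the double-evaluation hazard from the previous patch concrete, the macro call from its example expands like this:

	/* With the macro form, damon_rand(a++, b) expands to: */
	(a++ + prandom_u32_max(b - a++))
	/* 'a' is evaluated (and incremented) twice; the static inline
	 * form in the hunk above evaluates each argument exactly once. */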
[1] https://lore.kernel.org/linux-mm/20211202151213.6ec830863342220da4141bc5@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211209131806.19317-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211209131806.19317-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index e2c8152985b7..2dbc1f545da2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -399,14 +399,20 @@ struct damon_ctx { struct list_head schemes; }; -#define damon_next_region(r) \ - (container_of(r->list.next, struct damon_region, list)) +static inline struct damon_region *damon_next_region(struct damon_region *r) +{ + return container_of(r->list.next, struct damon_region, list); +} -#define damon_prev_region(r) \ - (container_of(r->list.prev, struct damon_region, list)) +static inline struct damon_region *damon_prev_region(struct damon_region *r) +{ + return container_of(r->list.prev, struct damon_region, list); +} -#define damon_last_region(t) \ - (list_last_entry(&t->regions_list, struct damon_region, list)) +static inline struct damon_region *damon_last_region(struct damon_target *t) +{ + return list_last_entry(&t->regions_list, struct damon_region, list); +} #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) -- cgit v1.2.3 From f4c6d22c6cf282ef7d24a724b9bd978ee2b74fc6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:14 -0800 Subject: mm/damon: remove a mistakenly added comment for a future feature Due to a mistake in patch reordering, a comment for a future feature called 'arbitrary monitoring target support'[1], which is still under development, was added. Because it only introduces confusion and we have no plan to post the patches soon, this commit removes the mistakenly added part. [1] https://lore.kernel.org/linux-mm/20201215115448.25633-3-sjpark@amazon.com/ Link: https://lkml.kernel.org/r/20211209131806.19317-7-sj@kernel.org Fixes: 1f366e421c8f ("mm/damon/core: implement DAMON-based Operation Schemes (DAMOS)") Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2dbc1f545da2..97f4a224e950 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -281,7 +281,7 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. + * to the region. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. -- cgit v1.2.3 From 0e92c2ee9f459542c5384d9cfab24873c3dd6398 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:17 -0800 Subject: mm/damon/schemes: account scheme actions that successfully applied Patch series "mm/damon/schemes: Extend stats for better online analysis and tuning". To help online access pattern analysis and tuning of DAMON-based Operation Schemes (DAMOS), DAMOS provides simple statistics for each scheme. 
The introduction of DAMOS time/space quotas further eased tuning by making risk management easier. However, it also made understanding how a scheme is working a little more difficult. For example, the progress of a given scheme can now be throttled not only by the aggressiveness of the target access pattern, but also by the time/space quotas. So, when a scheme shows unexpectedly slow progress, it is difficult to tell from the currently provided statistics what is throttling it. This patchset extends the statistics to contain some metrics that can be helpful for such online scheme analysis and tuning (patches 1-2), exports them to users (patches 3 and 5), and adds documentation (patches 4 and 6). This patch (of 6): DAMON-based operation scheme (DAMOS) stats provide only the number and the total size of the regions that the scheme's action has been tried on. Because the action can fail for various reasons, the currently provided information is sometimes not useful or convenient enough for scheme profiling and tuning. To improve this situation, this commit extends the DAMOS stats to also provide the number and the total size of the regions that the action has been successfully applied to. Link: https://lkml.kernel.org/r/20211210150016.35349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211210150016.35349-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 97f4a224e950..e0ad3d9aaeed 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -192,6 +192,20 @@ struct damos_watermarks { bool activated; }; +/** + * struct damos_stat - Statistics on a given scheme. + * @nr_tried: Total number of regions that the scheme is tried to be applied. + * @sz_tried: Total size of regions that the scheme is tried to be applied. + * @nr_applied: Total number of regions that the scheme is applied. + * @sz_applied: Total size of regions that the scheme is applied. + */ +struct damos_stat { + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -203,8 +217,7 @@ struct damos_watermarks { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. - * @stat_count: Total number of regions that this scheme is applied. - * @stat_sz: Total size of regions that this scheme is applied. + * @stat: Statistics of this scheme. * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the @@ -235,8 +248,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; - unsigned long stat_count; - unsigned long stat_sz; + struct damos_stat stat; struct list_head list; }; @@ -281,7 +293,8 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. + * to the region and return bytes of the region that the action is successfully + * applied. 
* @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -295,8 +308,9 @@ struct damon_primitive { int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); + unsigned long (*apply_scheme)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; -- cgit v1.2.3 From 6268eac34ca30af7f6313504d556ec7fcd295621 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:20 -0800 Subject: mm/damon/schemes: account how many times quota limit has exceeded If the time/space quotas of a given DAMON-based operation scheme are too small, the scheme can show unexpectedly slow progress. However, there is no good way to notice this at runtime. This commit extends the DAMOS stat to report how many times the quota limits have been exceeded, so that users can easily notice the situation and tune the scheme. Link: https://lkml.kernel.org/r/20211210150016.35349-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index e0ad3d9aaeed..af648388e759 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -198,12 +198,14 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @qt_exceeds: Total number of times the quota of the scheme has exceeded. */ struct damos_stat { unsigned long nr_tried; unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long qt_exceeds; }; /** -- cgit v1.2.3 From 2cd4b8e10cc31eadb5b10b1d73b3f28156f3776c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jan 2022 14:10:38 -0800 Subject: mm/damon: move the implementation of damon_insert_region to damon.h Usually, an inline function is declared static, since the keyword should sit between the storage class and the type, and it is implemented in a header file if it is used by multiple files. This change also fixes a compile issue when backporting DAMON to 5.10. 
mm/damon/vaddr.c: In function `damon_va_evenly_split_region': ./include/linux/damon.h:425:13: error: inlining failed in call to `always_inline' `damon_insert_region': function body not available 425 | inline void damon_insert_region(struct damon_region *r, | ^~~~~~~~~~~~~~~~~~~ mm/damon/vaddr.c:86:3: note: called from here 86 | damon_insert_region(n, r, next, t); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20211223085703.6142-1-guoqing.jiang@linux.dev Signed-off-by: Guoqing Jiang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index af648388e759..5e1e3a128b77 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -451,9 +451,18 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -inline void damon_insert_region(struct damon_region *r, + +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, struct damon_region *prev, struct damon_region *next, - struct damon_target *t); + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); -- cgit v1.2.3
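For reference, a sketch of the kind of call site that needs the inline body visible, loosely modeled on damon_va_evenly_split_region() from the build error above (names and bounds are illustrative):

	/* Split off the tail of region 'r' into a new region 'n' covering
	 * [mid, r->ar.end), and link it into the list right after 'r'. */
	struct damon_region *n = damon_new_region(mid, r->ar.end);
	if (!n)
		return -ENOMEM;
	r->ar.end = mid;
	damon_insert_region(n, r, damon_next_region(r), t);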