From f630c7c6f10546ebff15c3a856e7949feb7a2372 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 14 Dec 2020 19:03:14 -0800 Subject: kthread: add kthread_work tracepoints While migrating some code from wq to kthread_worker, I found that I missed the execute_start/end tracepoints. So add similar tracepoints for kthread_work. And for completeness, queue_work tracepoint (although this one differs slightly from the matching workqueue tracepoint). Link: https://lkml.kernel.org/r/20201010180323.126634-1-robdclark@gmail.com Signed-off-by: Rob Clark Cc: Rob Clark Cc: Steven Rostedt Cc: Ingo Molnar Cc: "Peter Zijlstra (Intel)" Cc: Phil Auld Cc: Valentin Schneider Cc: Thara Gopinath Cc: Randy Dunlap Cc: Vincent Donnefort Cc: Mel Gorman Cc: Jens Axboe Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Ilias Stamatis Cc: Liang Chen Cc: Ben Dooks Cc: Peter Zijlstra Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/sched.h | 84 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'include') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c96a4337afe6..5039af667645 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -5,6 +5,7 @@ #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H +#include #include #include #include @@ -51,6 +52,89 @@ TRACE_EVENT(sched_kthread_stop_ret, TP_printk("ret=%d", __entry->ret) ); +/** + * sched_kthread_work_queue_work - called when a work gets queued + * @worker: pointer to the kthread_worker + * @work: pointer to struct kthread_work + * + * This event occurs when a work is queued immediately or once a + * delayed work is actually queued (ie: once the delay has been + * reached). + */ +TRACE_EVENT(sched_kthread_work_queue_work, + + TP_PROTO(struct kthread_worker *worker, + struct kthread_work *work), + + TP_ARGS(worker, work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + __field( void *, worker) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = work->func; + __entry->worker = worker; + ), + + TP_printk("work struct=%p function=%ps worker=%p", + __entry->work, __entry->function, __entry->worker) +); + +/** + * sched_kthread_work_execute_start - called immediately before the work callback + * @work: pointer to struct kthread_work + * + * Allows to track kthread work execution. + */ +TRACE_EVENT(sched_kthread_work_execute_start, + + TP_PROTO(struct kthread_work *work), + + TP_ARGS(work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = work->func; + ), + + TP_printk("work struct %p: function %ps", __entry->work, __entry->function) +); + +/** + * sched_kthread_work_execute_end - called immediately after the work callback + * @work: pointer to struct work_struct + * @function: pointer to worker function + * + * Allows to track workqueue execution. + */ +TRACE_EVENT(sched_kthread_work_execute_end, + + TP_PROTO(struct kthread_work *work, kthread_work_func_t function), + + TP_ARGS(work, function), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = function; + ), + + TP_printk("work struct %p: function %ps", __entry->work, __entry->function) +); + /* * Tracepoint for waking up a task: */ -- cgit v1.2.3 From a85cbe6159ffc973e5702f70a3bd5185f8f3c38d Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Mon, 14 Dec 2020 19:03:21 -0800 Subject: uapi: move constants from to and include in UAPI headers instead of . The reason is to avoid indirect include when using some network headers: or others -> -> . This indirect include causes on MUSL redefinition of struct sysinfo when included both and some of UAPI headers: In file included from x86_64-buildroot-linux-musl/sysroot/usr/include/linux/kernel.h:5, from x86_64-buildroot-linux-musl/sysroot/usr/include/linux/netlink.h:5, from ../include/tst_netlink.h:14, from tst_crypto.c:13: x86_64-buildroot-linux-musl/sysroot/usr/include/linux/sysinfo.h:8:8: error: redefinition of `struct sysinfo' struct sysinfo { ^~~~~~~ In file included from ../include/tst_safe_macros.h:15, from ../include/tst_test.h:93, from tst_crypto.c:11: x86_64-buildroot-linux-musl/sysroot/usr/include/sys/sysinfo.h:10:8: note: originally defined here Link: https://lkml.kernel.org/r/20201015190013.8901-1-petr.vorel@gmail.com Signed-off-by: Petr Vorel Suggested-by: Rich Felker Acked-by: Rich Felker Cc: Peter Korsgaard Cc: Baruch Siach Cc: Florian Weimer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/const.h | 5 +++++ include/uapi/linux/ethtool.h | 2 +- include/uapi/linux/kernel.h | 9 +-------- include/uapi/linux/lightnvm.h | 2 +- include/uapi/linux/mroute6.h | 2 +- include/uapi/linux/netfilter/x_tables.h | 2 +- include/uapi/linux/netlink.h | 2 +- include/uapi/linux/sysctl.h | 2 +- 8 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index 5ed721ad5b19..af2a44c08683 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -28,4 +28,9 @@ #define _BITUL(x) (_UL(1) << (x)) #define _BITULL(x) (_ULL(1) << (x)) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) + +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + #endif /* _UAPI_LINUX_CONST_H */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9ca87bc73c44..cde753bb2093 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -14,7 +14,7 @@ #ifndef _UAPI_LINUX_ETHTOOL_H #define _UAPI_LINUX_ETHTOOL_H -#include +#include #include #include diff --git a/include/uapi/linux/kernel.h b/include/uapi/linux/kernel.h index 0ff8f7477847..fadf2db71fe8 100644 --- a/include/uapi/linux/kernel.h +++ b/include/uapi/linux/kernel.h @@ -3,13 +3,6 @@ #define _UAPI_LINUX_KERNEL_H #include - -/* - * 'kernel.h' contains some often-used function prototypes etc - */ -#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) -#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) - -#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#include #endif /* _UAPI_LINUX_KERNEL_H */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index f9a1be7fc696..ead2e72e5c88 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -21,7 +21,7 @@ #define _UAPI_LINUX_LIGHTNVM_H #ifdef __KERNEL__ -#include +#include #include #else /* __KERNEL__ */ #include diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h index c36177a86516..a1fd6173e2db 100644 --- a/include/uapi/linux/mroute6.h +++ b/include/uapi/linux/mroute6.h @@ -2,7 +2,7 @@ #ifndef _UAPI__LINUX_MROUTE6_H #define _UAPI__LINUX_MROUTE6_H -#include +#include #include #include #include /* For struct sockaddr_in6. */ diff --git a/include/uapi/linux/netfilter/x_tables.h b/include/uapi/linux/netfilter/x_tables.h index a8283f7dbc51..b8c6bb233ac1 100644 --- a/include/uapi/linux/netfilter/x_tables.h +++ b/include/uapi/linux/netfilter/x_tables.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_X_TABLES_H #define _UAPI_X_TABLES_H -#include +#include #include #define XT_FUNCTION_MAXNAMELEN 30 diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index c3816ff7bfc3..3d94269bbfa8 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -2,7 +2,7 @@ #ifndef _UAPI__LINUX_NETLINK_H #define _UAPI__LINUX_NETLINK_H -#include +#include #include /* for __kernel_sa_family_t */ #include diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 27c1ed2822e6..458179df9b27 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -23,7 +23,7 @@ #ifndef _UAPI_LINUX_SYSCTL_H #define _UAPI_LINUX_SYSCTL_H -#include +#include #include #include -- cgit v1.2.3 From f0dbd2bd1c22c6670e83ddcd46a9beb8b575e86d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 14 Dec 2020 19:03:55 -0800 Subject: mm: slab: provide krealloc_array() When allocating an array of elements, users should check for multiplication overflow or preferably use one of the provided helpers like: kmalloc_array(). There's no krealloc_array() counterpart but there are many users who use regular krealloc() to reallocate arrays. Let's provide an actual krealloc_array() implementation. While at it: add some documentation regarding krealloc. Link: https://lkml.kernel.org/r/20201109110654.12547-3-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski Acked-by: Vlastimil Babka Cc: Alexander Shishkin Cc: Andy Shevchenko Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Knig Cc: Christoph Lameter Cc: Daniel Vetter Cc: Daniel Vetter Cc: David Airlie Cc: David Rientjes Cc: Gustavo Padovan Cc: James Morse Cc: Jaroslav Kysela Cc: Jason Wang Cc: Joonsoo Kim Cc: Linus Walleij Cc: Maarten Lankhorst Cc: Mauro Carvalho Chehab Cc: Maxime Ripard Cc: "Michael S . Tsirkin" Cc: Pekka Enberg Cc: Robert Richter Cc: Sumit Semwal Cc: Takashi Iwai Cc: Takashi Iwai Cc: Thomas Zimmermann Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index dd6897f62010..be4ba5867ac5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -592,6 +592,24 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) return __kmalloc(bytes, flags); } +/** + * krealloc_array - reallocate memory for an array. + * @p: pointer to the memory chunk to reallocate + * @new_n: new number of elements to alloc + * @new_size: new size of a single member of the array + * @flags: the type of memory to allocate (see kmalloc) + */ +static __must_check inline void * +krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) + return NULL; + + return krealloc(p, bytes, flags); +} + /** * kcalloc - allocate memory for an array. The memory is set to zero. * @n: number of elements. -- cgit v1.2.3 From 7fb7ab6d618a4dc7ea3f3eafc92388a35b4f8894 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Mon, 14 Dec 2020 19:04:46 -0800 Subject: mm: fix page_owner initializing issue for arm32 Page owner of pages used by page owner itself used is missing on arm32 targets. The reason is dummy_handle and failure_handle is not initialized correctly. Buddy allocator is used to initialize these two handles. However, buddy allocator is not ready when page owner calls it. This change fixed that by initializing page owner after buddy initialization. The working flow before and after this change are: original logic: 1. allocated memory for page_ext(using memblock). 2. invoke the init callback of page_ext_ops like page_owner(using buddy allocator). 3. initialize buddy. after this change: 1. allocated memory for page_ext(using memblock). 2. initialize buddy. 3. invoke the init callback of page_ext_ops like page_owner(using buddy allocator). with the change, failure/dummy_handle can get its correct value and page owner output for example has the one for page owner itself: Page allocated via order 2, mask 0x6202c0(GFP_USER|__GFP_NOWARN), pid 1006, ts 67278156558 ns PFN 543776 type Unmovable Block 531 type Unmovable Flags 0x0() init_page_owner+0x28/0x2f8 invoke_init_callbacks_flatmem+0x24/0x34 start_kernel+0x33c/0x5d8 Link: https://lkml.kernel.org/r/1603104925-5888-1-git-send-email-zhenhuah@codeaurora.org Signed-off-by: Zhenhua Huang Acked-by: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index cfce186f0c4e..aff81ba31bd8 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -44,8 +44,12 @@ static inline void page_ext_init_flatmem(void) { } extern void page_ext_init(void); +static inline void page_ext_init_flatmem_late(void) +{ +} #else extern void page_ext_init_flatmem(void); +extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } @@ -76,6 +80,10 @@ static inline void page_ext_init(void) { } +static inline void page_ext_init_flatmem_late(void) +{ +} + static inline void page_ext_init_flatmem(void) { } -- cgit v1.2.3 From 57efa1fe5957694fa541c9062de0a127f0b9acb0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Dec 2020 19:05:44 -0800 Subject: mm/gup: prevent gup_fast from racing with COW during fork Since commit 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") pages under a FOLL_PIN will not be write protected during COW for fork. This means that pages returned from pin_user_pages(FOLL_WRITE) should not become write protected while the pin is active. However, there is a small race where get_user_pages_fast(FOLL_PIN) can establish a FOLL_PIN at the same time copy_present_page() is write protecting it: CPU 0 CPU 1 get_user_pages_fast() internal_get_user_pages_fast() copy_page_range() pte_alloc_map_lock() copy_present_page() atomic_read(has_pinned) == 0 page_maybe_dma_pinned() == false atomic_set(has_pinned, 1); gup_pgd_range() gup_pte_range() pte_t pte = gup_get_pte(ptep) pte_access_permitted(pte) try_grab_compound_head() pte = pte_wrprotect(pte) set_pte_at(); pte_unmap_unlock() // GUP now returns with a write protected page The first attempt to resolve this by using the write protect caused problems (and was missing a barrrier), see commit f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Instead wrap copy_p4d_range() with the write side of a seqcount and check the read side around gup_pgd_range(). If there is a collision then get_user_pages_fast() fails and falls back to slow GUP. Slow GUP is safe against this race because copy_page_range() is only called while holding the exclusive side of the mmap_lock on the src mm_struct. [akpm@linux-foundation.org: coding style fixes] Link: https://lore.kernel.org/r/CAHk-=wi=iCnYCARbPGjkVJu9eyYeZ13N64tZYLdOB8CP5Q_PLw@mail.gmail.com Link: https://lkml.kernel.org/r/2-v4-908497cf359a+4782-gup_fork_jgg@nvidia.com Fixes: f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Signed-off-by: Jason Gunthorpe Suggested-by: Linus Torvalds Reviewed-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Peter Xu Acked-by: "Ahmed S. Darwish" [seqcount_t parts] Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Jann Horn Cc: Kirill Shutemov Cc: Kirill Tkhai Cc: Leon Romanovsky Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..915f4f100383 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -446,6 +447,13 @@ struct mm_struct { */ atomic_t has_pinned; + /** + * @write_protect_seq: Locked when any thread is write + * protecting pages mapped by this mm to enforce a later COW, + * for instance during page table copying for fork(). + */ + seqcount_t write_protect_seq; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif -- cgit v1.2.3 From 52650c8b466bac399aec213c61d74bfe6f7af1a4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Dec 2020 19:05:48 -0800 Subject: mm/gup: remove the vma allocation from gup_longterm_locked() Long ago there wasn't a FOLL_LONGTERM flag so this DAX check was done by post-processing the VMA list. These days it is trivial to just check each VMA to see if it is DAX before processing it inside __get_user_pages() and return failure if a DAX VMA is encountered with FOLL_LONGTERM. Removing the allocation of the VMA list is a significant speed up for many call sites. Add an IS_ENABLED to vma_is_fsdax so that code generation is unchanged when DAX is compiled out. Remove the dummy version of __gup_longterm_locked() as !CONFIG_CMA already makes memalloc_nocma_save(), check_and_migrate_cma_pages(), and memalloc_nocma_restore() into a NOP. Link: https://lkml.kernel.org/r/0-v1-5551df3ed12e+b8-gup_dax_speedup_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: Ira Weiny Cc: Dan Williams Cc: John Hubbard Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8667d0cdc71e..1fcc2b00582b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3230,7 +3230,7 @@ static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; - if (!vma->vm_file) + if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) return false; if (!vma_is_dax(vma)) return false; -- cgit v1.2.3 From 462680946b6d982afdda3bf5f7de3c379cb8c97b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 14 Dec 2020 19:06:11 -0800 Subject: mm: remove pagevec_lookup_range_nr_tag() With the merge of commit 2e1692966034 ("ceph: have ceph_writepages_start call pagevec_lookup_range_tag"), nothing calls this anymore. Link: https://lkml.kernel.org/r/20201021193926.101474-1-jlayton@kernel.org Signed-off-by: Jeff Layton Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 081d934eda64..ad4ddc17d403 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -43,9 +43,6 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag); -unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, xa_mark_t tag) { -- cgit v1.2.3 From 30e6a51dbb0594d79dc2a9543659c1d596e2f7d4 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Mon, 14 Dec 2020 19:06:14 -0800 Subject: mm/shmem.c: make shmem_mapping() inline shmem_mapping() isn't worth an out-of-line call from any callsite. So make it inline by - make shmem_aops global - export shmem_aops - inline the shmem_mapping() and replace the direct call 'shmem_aops' with shmem_mapping() in shmem.c. Link: https://lkml.kernel.org/r/20201115165207.GA265355@rlk Signed-off-by: Hui Su Acked-by: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a5a5d1d4d7b1..d82b6f396588 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -67,7 +67,11 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); #ifdef CONFIG_SHMEM -extern bool shmem_mapping(struct address_space *mapping); +extern const struct address_space_operations shmem_aops; +static inline bool shmem_mapping(struct address_space *mapping) +{ + return mapping->a_ops == &shmem_aops; +} #else static inline bool shmem_mapping(struct address_space *mapping) { -- cgit v1.2.3 From 1a984c4e8200e0e58bb316f14a4bebb28d32d15a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Dec 2020 19:06:24 -0800 Subject: mm: memcontrol: remove unused mod_memcg_obj_state() Since commit 991e7673859e ("mm: memcontrol: account kernel stack per node") there is no user of the mod_memcg_obj_state(). So just remove it. Also rework type of the idx parameter of the mod_objcg_state() from int to enum node_stat_item. Link: https://lkml.kernel.org/r/20201013153504.92602-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Acked-by: David Rientjes Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Christopher Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Yafang Shao Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 922a7f600465..28bf65560084 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -798,8 +798,6 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); -void mod_memcg_obj_state(void *p, int idx, int val); - static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) { @@ -1255,10 +1253,6 @@ static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, mod_node_page_state(page_pgdat(page), idx, val); } -static inline void mod_memcg_obj_state(void *p, int idx, int val) -{ -} - static inline unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, -- cgit v1.2.3 From 013339df116c2ee0d796dd8bfb8f293a2030c063 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 14 Dec 2020 19:06:39 -0800 Subject: mm/rmap: always do TTU_IGNORE_ACCESS Since commit 369ea8242c0f ("mm/rmap: update to new mmu_notifier semantic v2"), the code to check the secondary MMU's page table access bit is broken for !(TTU_IGNORE_ACCESS) because the page is unmapped from the secondary MMU's page table before the check. More specifically for those secondary MMUs which unmap the memory in mmu_notifier_invalidate_range_start() like kvm. However memory reclaim is the only user of !(TTU_IGNORE_ACCESS) or the absence of TTU_IGNORE_ACCESS and it explicitly performs the page table access check before trying to unmap the page. So, at worst the reclaim will miss accesses in a very short window if we remove page table access check in unmapping code. There is an unintented consequence of !(TTU_IGNORE_ACCESS) for the memcg reclaim. From memcg reclaim the page_referenced() only account the accesses from the processes which are in the same memcg of the target page but the unmapping code is considering accesses from all the processes, so, decreasing the effectiveness of memcg reclaim. The simplest solution is to always assume TTU_IGNORE_ACCESS in unmapping code. Link: https://lkml.kernel.org/r/20201104231928.1494083-1-shakeelb@google.com Fixes: 369ea8242c0f ("mm/rmap: update to new mmu_notifier semantic v2") Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Jerome Glisse Cc: Vlastimil Babka Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3a6adfa70fb0..70085ca1a3fc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -91,7 +91,6 @@ enum ttu_flags { TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ - TTU_IGNORE_ACCESS = 0x10, /* don't age */ TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will -- cgit v1.2.3 From a7cb874bfff785d39de6cc847673539cb3540821 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:45 -0800 Subject: mm: memcg: fix obsolete code comments This patch fixes/removes some obsolete comments in the code related to the kernel memory accounting: - kmem_cache->memcg_params.memcg_caches has been removed by commit 9855609bde03 ("mm: memcg/slab: use a single set of kmem_caches for all accounted allocations") - memcg->kmemcg_id is not used as a gate for kmem accounting since commit 0b8f73e10428 ("mm: memcontrol: clean up alloc, online, offline, free functions") Link: https://lkml.kernel.org/r/20201110184615.311974-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 28bf65560084..1f0f64fa3ccd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -296,7 +296,6 @@ struct mem_cgroup { int tcpmem_pressure; #ifdef CONFIG_MEMCG_KMEM - /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; struct obj_cgroup __rcu *objcg; @@ -1562,9 +1561,8 @@ static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg, } /* - * helper for accessing a memcg's index. It will be used as an index in the - * child cache array in kmem_cache, and also to derive its name. This function - * will return -1 when this is not a kmem-limited memcg. + * A helper for accessing memcg's kmem_id, used for getting + * corresponding LRU lists. */ static inline int memcg_cache_id(struct mem_cgroup *memcg) { -- cgit v1.2.3 From bef8620cd8e0a117c1a0719604052e424eb418f9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:49 -0800 Subject: mm: memcg: deprecate the non-hierarchical mode Patch series "mm: memcg: deprecate cgroup v1 non-hierarchical mode", v1. The non-hierarchical cgroup v1 mode is a legacy of early days of the memory controller and doesn't bring any value today. However, it complicates the code and creates many edge cases all over the memory controller code. It's a good time to deprecate it completely. This patchset removes the internal logic, adjusts the user interface and updates the documentation. The alt patch removes some bits of the cgroup core code, which become obsolete. Michal Hocko said: "All that we know today is that we have a warning in place to complain loudly when somebody relies on use_hierarchy=0 with a deeper hierarchy. For all those years we have seen _zero_ reports that would describe a sensible usecase. Moreover we (SUSE) have backported this warning into old distribution kernels (since 3.0 based kernels) to extend the coverage and didn't hear even for users who adopt new kernels only very slowly. The only report we have seen so far was a LTP test suite which doesn't really reflect any real life usecase" This patch (of 3): The non-hierarchical cgroup v1 mode is a legacy of early days of the memory controller and doesn't bring any value today. However, it complicates the code and creates many edge cases all over the memory controller code. It's a good time to deprecate it completely. Functionally this patch enabled is by default for all cgroups and forbids switching it off. Nothing changes if cgroup v2 is used: hierarchical mode was enforced from scratch. To protect the ABI memory.use_hierarchy interface is preserved with a limited functionality: reading always returns "1", writing of "1" passes silently, writing of any other value fails with -EINVAL and a warning to dmesg (on the first occasion). Link: https://lkml.kernel.org/r/20201110220800.929549-1-guro@fb.com Link: https://lkml.kernel.org/r/20201110220800.929549-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f0f64fa3ccd..dd992b81bcb7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -234,11 +234,6 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - /* - * Should the accounting and control be hierarchical, per subtree? - */ - bool use_hierarchy; - /* * Should the OOM killer kill all belonging tasks, had it kill one? */ @@ -588,8 +583,6 @@ static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, { if (root == memcg) return true; - if (!root->use_hierarchy) - return false; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } -- cgit v1.2.3 From 9d9d341df4d519d96e7927941d91f5785c5cea07 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:55 -0800 Subject: cgroup: remove obsoleted broken_hierarchy and warned_broken_hierarchy With the deprecation of the non-hierarchical mode of the memory controller there are no more examples of broken hierarchies left. Let's remove the cgroup core code which was supposed to print warnings about creating of broken hierarchies. Link: https://lkml.kernel.org/r/20201110220800.929549-4-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup-defs.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index fee0b5547cd0..559ee05f86b2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -668,21 +668,6 @@ struct cgroup_subsys { */ bool threaded:1; - /* - * If %false, this subsystem is properly hierarchical - - * configuration, resource accounting and restriction on a parent - * cgroup cover those of its children. If %true, hierarchy support - * is broken in some ways - some subsystems ignore hierarchy - * completely while others are only implemented half-way. - * - * It's now disallowed to create nested cgroups if the subsystem is - * broken and cgroup core will emit a warning message on such - * cases. Eventually, all subsystems will be made properly - * hierarchical and this will go away. - */ - bool broken_hierarchy:1; - bool warned_broken_hierarchy:1; - /* the following two fields are initialized automtically during boot */ int id; const char *name; -- cgit v1.2.3 From da3ceeff923e3bc750a8423c840462760c463926 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Dec 2020 19:07:04 -0800 Subject: mm: memcg/slab: rename *_lruvec_slab_state to *_lruvec_kmem_state The *_lruvec_slab_state is also suitable for pages allocated from buddy, not just for the slab objects. But the function name seems to tell us that only slab object is applicable. So we can rename the keyword of slab to kmem. Link: https://lkml.kernel.org/r/20201117085249.24319-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dd992b81bcb7..71c961452d9a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -788,15 +788,15 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); - __mod_lruvec_slab_state(p, idx, val); + __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } @@ -1229,7 +1229,7 @@ static inline void mod_lruvec_page_state(struct page *page, mod_node_page_state(page_pgdat(page), idx, val); } -static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1237,7 +1237,7 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, __mod_node_page_state(page_pgdat(page), idx, val); } -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1332,14 +1332,14 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } -static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, 1); + __mod_lruvec_kmem_state(p, idx, 1); } -static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, -1); + __mod_lruvec_kmem_state(p, idx, -1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ -- cgit v1.2.3 From c47d5032ed3002311a4188eae51f4641ec436beb Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 14 Dec 2020 19:07:14 -0800 Subject: mm: move lruvec stats update functions to vmstat.h Patch series "memcg: add pagetable comsumption to memory.stat", v2. Many workloads consumes significant amount of memory in pagetables. One specific use-case is the user space network driver which mmaps the application memory to provide zero copy transfer. This driver can consume a large amount memory in page tables. This patch series exposes the pagetable comsumption for each memory cgroup. This patch (of 2): This does not change any functionality and only move the functions which update the lruvec stats to vmstat.h from memcontrol.h. The main reason for this patch is to be able to use these functions in the page table contructor function which is defined in mm.h and we can not include the memcontrol.h in that file. Also this is a better place for this interface in general. The lruvec abstraction, while invented for memcg, isn't specific to memcg at all. Link: https://lkml.kernel.org/r/20201130212541.2781790-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 111 --------------------------------------------- include/linux/vmstat.h | 104 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 111 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 71c961452d9a..f530d634f055 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -786,8 +786,6 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, - int val); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, @@ -810,43 +808,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_state(lruvec, idx, val); - local_irq_restore(flags); -} - -static inline void __mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - struct page *head = compound_head(page); /* rmap on tail pages */ - pg_data_t *pgdat = page_pgdat(page); - struct lruvec *lruvec; - - /* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!head->mem_cgroup) { - __mod_node_page_state(pgdat, idx, val); - return; - } - - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); - __mod_lruvec_state(lruvec, idx, val); -} - -static inline void mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_page_state(page, idx, val); - local_irq_restore(flags); -} - unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); @@ -1205,30 +1166,6 @@ static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, { } -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - -static inline void __mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(page_pgdat(page), idx, val); -} - -static inline void mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - mod_node_page_state(page_pgdat(page), idx, val); -} - static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1308,30 +1245,6 @@ static inline void __dec_memcg_page_state(struct page *page, __mod_memcg_page_state(page, idx, -1); } -static inline void __inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - __mod_lruvec_state(lruvec, idx, 1); -} - -static inline void __dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - __mod_lruvec_state(lruvec, idx, -1); -} - -static inline void __inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - __mod_lruvec_page_state(page, idx, 1); -} - -static inline void __dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - __mod_lruvec_page_state(page, idx, -1); -} - static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); @@ -1370,30 +1283,6 @@ static inline void dec_memcg_page_state(struct page *page, mod_memcg_page_state(page, idx, -1); } -static inline void inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, 1); -} - -static inline void dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, -1); -} - -static inline void inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, 1); -} - -static inline void dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, -1); -} - static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 322dcbfcc933..773135fc6e19 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -450,4 +450,108 @@ static inline const char *vm_event_name(enum vm_event_item item) } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ +#ifdef CONFIG_MEMCG + +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val); + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_lruvec_state(lruvec, idx, val); + local_irq_restore(flags); +} + +void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val); + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_lruvec_page_state(page, idx, val); + local_irq_restore(flags); +} + +#else + +static inline void __mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + __mod_node_page_state(page_pgdat(page), idx, val); +} + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + mod_node_page_state(page_pgdat(page), idx, val); +} + +#endif /* CONFIG_MEMCG */ + +static inline void __inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, 1); +} + +static inline void __dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, -1); +} + +static inline void __inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, 1); +} + +static inline void __dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, -1); +} + +static inline void inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, 1); +} + +static inline void dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, -1); +} + +static inline void inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, 1); +} + +static inline void dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, -1); +} + #endif /* _LINUX_VMSTAT_H */ -- cgit v1.2.3 From f0c0c115fb81940f4dba0644ac2a8a43b39c83f3 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 14 Dec 2020 19:07:17 -0800 Subject: mm: memcontrol: account pagetables per node For many workloads, pagetable consumption is significant and it makes sense to expose it in the memory.stat for the memory cgroups. However at the moment, the pagetables are accounted per-zone. Converting them to per-node and using the right interface will correctly account for the memory cgroups as well. [akpm@linux-foundation.org: export __mod_lruvec_page_state to modules for arch/mips/kvm/] Link: https://lkml.kernel.org/r/20201130212541.2781790-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- include/linux/mmzone.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..5bbbf4aeee94 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2203,7 +2203,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page) if (!ptlock_init(page)) return false; __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -2211,7 +2211,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); + dec_lruvec_page_state(page, NR_PAGETABLE); } #define pte_offset_map_lock(mm, pmd, address, ptlp) \ @@ -2298,7 +2298,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) if (!pmd_ptlock_init(page)) return false; __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -2306,7 +2306,7 @@ static inline void pgtable_pmd_page_dtor(struct page *page) { pmd_ptlock_free(page); __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); + dec_lruvec_page_state(page, NR_PAGETABLE); } /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb3bf696c05e..cca2a4443c9c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -152,7 +152,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - NR_PAGETABLE, /* used for pagetables */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) @@ -207,6 +206,7 @@ enum node_stat_item { #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif + NR_PAGETABLE, /* used for pagetables */ NR_VM_NODE_STAT_ITEMS }; -- cgit v1.2.3 From d3f5ffcacd1528736471bc78f03f06da6c4551cc Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 14 Dec 2020 19:07:45 -0800 Subject: mm: cleanup: remove unused tsk arg from __access_remote_vm Despite a comment that said that page fault accounting would be charged to whatever task_struct* was passed into __access_remote_vm(), the tsk argument was actually unused. Making page fault accounting actually use this task struct is quite a project, so there is no point in keeping the tsk argument. Delete both the comment, and the argument. [rppt@linux.ibm.com: changelog addition] Link: https://lkml.kernel.org/r/20201026074137.4147787-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Mike Rapoport Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5bbbf4aeee94..1d8e84bf718a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1716,8 +1716,8 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags); +extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, -- cgit v1.2.3 From 2b5067a8143e34aa3fa57a20fb8a3c40d905f942 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Mon, 14 Dec 2020 19:07:55 -0800 Subject: mm: mmap_lock: add tracepoints around lock acquisition The goal of these tracepoints is to be able to debug lock contention issues. This lock is acquired on most (all?) mmap / munmap / page fault operations, so a multi-threaded process which does a lot of these can experience significant contention. We trace just before we start acquisition, when the acquisition returns (whether it succeeded or not), and when the lock is released (or downgraded). The events are broken out by lock type (read / write). The events are also broken out by memcg path. For container-based workloads, users often think of several processes in a memcg as a single logical "task", so collecting statistics at this level is useful. The end goal is to get latency information. This isn't directly included in the trace events. Instead, users are expected to compute the time between "start locking" and "acquire returned", using e.g. synthetic events or BPF. The benefit we get from this is simpler code. Because we use tracepoint_enabled() to decide whether or not to trace, this patch has effectively no overhead unless tracepoints are enabled at runtime. If tracepoints are enabled, there is a performance impact, but how much depends on exactly what e.g. the BPF program does. [axelrasmussen@google.com: fix use-after-free race and css ref leak in tracepoints] Link: https://lkml.kernel.org/r/20201130233504.3725241-1-axelrasmussen@google.com [axelrasmussen@google.com: v3] Link: https://lkml.kernel.org/r/20201207213358.573750-1-axelrasmussen@google.com [rostedt@goodmis.org: in-depth examples of tracepoint_enabled() usage, and per-cpu-per-context buffer design] Link: https://lkml.kernel.org/r/20201105211739.568279-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Vlastimil Babka Cc: Steven Rostedt Cc: Ingo Molnar Cc: Michel Lespinasse Cc: Daniel Jordan Cc: Jann Horn Cc: Chinwen Chang Cc: Davidlohr Bueso Cc: David Rientjes Cc: Laurent Dufour Cc: Yafang Shao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmap_lock.h | 94 ++++++++++++++++++++++++++++++++-- include/trace/events/mmap_lock.h | 107 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 5 deletions(-) create mode 100644 include/trace/events/mmap_lock.h (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 18e7eae9b5ba..0540f0156f58 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,11 +1,65 @@ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H +#include +#include #include +#include +#include +#include #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), +DECLARE_TRACEPOINT(mmap_lock_start_locking); +DECLARE_TRACEPOINT(mmap_lock_acquire_returned); +DECLARE_TRACEPOINT(mmap_lock_released); + +#ifdef CONFIG_TRACING + +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, + bool success); +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); + +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, + bool write) +{ + if (tracepoint_enabled(mmap_lock_start_locking)) + __mmap_lock_do_trace_start_locking(mm, write); +} + +static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, + bool write, bool success) +{ + if (tracepoint_enabled(mmap_lock_acquire_returned)) + __mmap_lock_do_trace_acquire_returned(mm, write, success); +} + +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) +{ + if (tracepoint_enabled(mmap_lock_released)) + __mmap_lock_do_trace_released(mm, write); +} + +#else /* !CONFIG_TRACING */ + +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, + bool write) +{ +} + +static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, + bool write, bool success) +{ +} + +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) +{ +} + +#endif /* CONFIG_TRACING */ + static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); @@ -13,57 +67,86 @@ static inline void mmap_init_lock(struct mm_struct *mm) static inline void mmap_write_lock(struct mm_struct *mm) { + __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, true, true); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { + __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + __mmap_lock_trace_acquire_returned(mm, true, true); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { - return down_write_killable(&mm->mmap_lock); + int ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_killable(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, true, ret == 0); + return ret; } static inline bool mmap_write_trylock(struct mm_struct *mm) { - return down_write_trylock(&mm->mmap_lock) != 0; + bool ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_trylock(&mm->mmap_lock) != 0; + __mmap_lock_trace_acquire_returned(mm, true, ret); + return ret; } static inline void mmap_write_unlock(struct mm_struct *mm) { up_write(&mm->mmap_lock); + __mmap_lock_trace_released(mm, true); } static inline void mmap_write_downgrade(struct mm_struct *mm) { downgrade_write(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, true); } static inline void mmap_read_lock(struct mm_struct *mm) { + __mmap_lock_trace_start_locking(mm, false); down_read(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, true); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { - return down_read_killable(&mm->mmap_lock); + int ret; + + __mmap_lock_trace_start_locking(mm, false); + ret = down_read_killable(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, ret == 0); + return ret; } static inline bool mmap_read_trylock(struct mm_struct *mm) { - return down_read_trylock(&mm->mmap_lock) != 0; + bool ret; + + __mmap_lock_trace_start_locking(mm, false); + ret = down_read_trylock(&mm->mmap_lock) != 0; + __mmap_lock_trace_acquire_returned(mm, false, ret); + return ret; } static inline void mmap_read_unlock(struct mm_struct *mm) { up_read(&mm->mmap_lock); + __mmap_lock_trace_released(mm, false); } static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) { - if (down_read_trylock(&mm->mmap_lock)) { + if (mmap_read_trylock(mm)) { rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); return true; } @@ -73,6 +156,7 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { up_read_non_owner(&mm->mmap_lock); + __mmap_lock_trace_released(mm, false); } static inline void mmap_assert_locked(struct mm_struct *mm) diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h new file mode 100644 index 000000000000..0abff67b96f0 --- /dev/null +++ b/include/trace/events/mmap_lock.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mmap_lock + +#if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MMAP_LOCK_H + +#include +#include + +struct mm_struct; + +extern int trace_mmap_lock_reg(void); +extern void trace_mmap_lock_unreg(void); + +TRACE_EVENT_FN(mmap_lock_start_locking, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + + TP_ARGS(mm, memcg_path, write), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? "true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +TRACE_EVENT_FN(mmap_lock_acquire_returned, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, + bool success), + + TP_ARGS(mm, memcg_path, write, success), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + __field(bool, success) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + __entry->success = success; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s success=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? "true" : "false", + __entry->success ? "true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +TRACE_EVENT_FN(mmap_lock_released, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + + TP_ARGS(mm, memcg_path, write), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? "true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +#endif /* _TRACE_MMAP_LOCK_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 0966aeb404e854e3377a10fcd01be46f19055bc6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Dec 2020 19:08:02 -0800 Subject: mm: move free_unref_page to mm/internal.h Code outside mm/ should not be calling free_unref_page(). Also move free_unref_page_list(). Link: https://lkml.kernel.org/r/20201125034655.27687-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c603237e006c..6e479e9c48ce 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -580,8 +580,6 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -extern void free_unref_page(struct page *page); -extern void free_unref_page_list(struct list_head *list); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); -- cgit v1.2.3 From cd544fd1dc9293c6702fab6effa63dac1cc67e99 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 14 Dec 2020 19:08:13 -0800 Subject: mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio As kernel expect to see only one of such mappings, any further operations on the VMA-copy may be unexpected by the kernel. Maybe it's being on the safe side, but there doesn't seem to be any expected use-case for this, so restrict it now. Link: https://lkml.kernel.org/r/20201013013416.390574-4-dima@arista.com Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov Cc: Alexander Viro Cc: Andy Lutomirski Cc: Brian Geffon Cc: Catalin Marinas Cc: Dan Carpenter Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Minchan Kim Cc: Ralph Campbell Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d8e84bf718a..dacc889744b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -558,7 +558,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); - int (*mremap)(struct vm_area_struct * area); + int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); -- cgit v1.2.3 From dd3b614f858d88f33e0cf8b7353e2ad937e71da3 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 14 Dec 2020 19:08:17 -0800 Subject: vm_ops: rename .split() callback to .may_split() Rename the callback to reflect that it's not called *on* or *after* split, but rather some time before the splitting to check if it's possible. Link: https://lkml.kernel.org/r/20201013013416.390574-5-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Alexander Viro Cc: Andy Lutomirski Cc: Brian Geffon Cc: Catalin Marinas Cc: Dan Carpenter Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Minchan Kim Cc: Ralph Campbell Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index dacc889744b1..5d188b8611dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -557,7 +557,8 @@ enum page_entry_size { struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); - int (*split)(struct vm_area_struct * area, unsigned long addr); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, -- cgit v1.2.3 From 95d6c701f4ca7c44dc148d664f604541266a2333 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 14 Dec 2020 19:08:34 -0800 Subject: mm: extract might_alloc() debug check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extracted from slab.h, which seems to have the most complete version including the correct might_sleep() check. Roll it out to slob.c. Motivated by a discussion with Paul about possibly changing call_rcu behaviour to allocate memory, but only roughly every 500th call. There are a lot fewer places in the kernel that care about whether allocating memory is allowed or not (due to deadlocks with reclaim code) than places that care whether sleeping is allowed. But debugging these also tends to be a lot harder, so nice descriptive checks could come in handy. I might have some use eventually for annotations in drivers/gpu. Note that unlike fs_reclaim_acquire/release gfpflags_allow_blocking does not consult the PF_MEMALLOC flags. But there is no flag equivalent for GFP_NOWAIT, hence this check can't go wrong due to memalloc_no*_save/restore contexts. Willy is working on a patch series which might change this: https://lore.kernel.org/linux-mm/20200625113122.7540-7-willy@infradead.org/ I think best would be if that updates gfpflags_allow_blocking(), since there's a ton of callers all over the place for that already. Link: https://lkml.kernel.org/r/20201125162532.1299794-3-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Acked-by: Vlastimil Babka Acked-by: Paul E. McKenney Reviewed-by: Jason Gunthorpe Cc: Randy Dunlap Cc: Paul E. McKenney Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Mathieu Desnoyers Cc: Sebastian Andrzej Siewior Cc: Michel Lespinasse Cc: Daniel Vetter Cc: Waiman Long Cc: Thomas Gleixner Cc: Randy Dunlap Cc: Dave Chinner Cc: Qian Cai Cc: "Matthew Wilcox (Oracle)" Cc: Christian König Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Maarten Lankhorst Cc: Thomas Hellström (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index d5ece7a9a403..a11a61b5226f 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -180,6 +180,22 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif +/** + * might_alloc - Mark possible allocation sites + * @gfp_mask: gfp_t flags that would be used to allocate + * + * Similar to might_sleep() and other annotations, this can be used in functions + * that might allocate, but often don't. Compiles to nothing without + * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking. + */ +static inline void might_alloc(gfp_t gfp_mask) +{ + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); + + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); +} + /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * -- cgit v1.2.3 From 96e2db456135db0cf2476b6890f1e8b2fdcf21eb Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 14 Dec 2020 19:08:49 -0800 Subject: mm/vmalloc: rework the drain logic A current "lazy drain" model suffers from at least two issues. First one is related to the unsorted list of vmap areas, thus in order to identify the [min:max] range of areas to be drained, it requires a full list scan. What is a time consuming if the list is too long. Second one and as a next step is about merging all fragments with a free space. What is also a time consuming because it has to iterate over entire list which holds outstanding lazy areas. See below the "preemptirqsoff" tracer that illustrates a high latency. It is ~24676us. Our workloads like audio and video are effected by such long latency: tracer: preemptirqsoff preemptirqsoff latency trace v1.1.5 on 4.9.186-perf+ -------------------------------------------------------------------- latency: 24676 us, #4/4, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 P:8) ----------------- | task: crtc_commit:112-261 (uid:0 nice:0 policy:1 rt_prio:16) ----------------- => started at: __purge_vmap_area_lazy => ended at: __purge_vmap_area_lazy _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / delay cmd pid ||||| time | caller \ / ||||| \ | / crtc_com-261 1...1 1us*: _raw_spin_lock <-__purge_vmap_area_lazy [...] crtc_com-261 1...1 24675us : _raw_spin_unlock <-__purge_vmap_area_lazy crtc_com-261 1...1 24677us : trace_preempt_on <-__purge_vmap_area_lazy crtc_com-261 1...1 24683us : => free_vmap_area_noflush => remove_vm_area => __vunmap => vfree => drm_property_free_blob => drm_mode_object_unreference => drm_property_unreference_blob => __drm_atomic_helper_crtc_destroy_state => sde_crtc_destroy_state => drm_atomic_state_default_clear => drm_atomic_state_clear => drm_atomic_state_free => complete_commit => _msm_drm_commit_work_cb => kthread_worker_fn => kthread => ret_from_fork To address those two issues we can redesign a purging of the outstanding lazy areas. Instead of queuing vmap areas to the list, we replace it by the separate rb-tree. In hat case an area is located in the tree/list in ascending order. It will give us below advantages: a) Outstanding vmap areas are merged creating bigger coalesced blocks, thus it becomes less fragmented. b) It is possible to calculate a flush range [min:max] without scanning all elements. It is O(1) access time or complexity; c) The final merge of areas with the rb-tree that represents a free space is faster because of (a). As a result the lock contention is also reduced. Link: https://lkml.kernel.org/r/20201116220033.1837-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Cc: Minchan Kim Cc: huang ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 938eaf9517e2..80c0181c411d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -72,16 +72,14 @@ struct vmap_area { struct list_head list; /* address sorted list */ /* - * The following three variables can be packed, because - * a vmap_area object is always one of the three states: + * The following two variables can be packed, because + * a vmap_area object can be either: * 1) in "free" tree (root is vmap_area_root) - * 2) in "busy" tree (root is free_vmap_area_root) - * 3) in purge list (head is vmap_purge_list) + * 2) or "busy" tree (root is free_vmap_area_root) */ union { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ - struct llist_node purge_list; /* in purge list */ }; }; -- cgit v1.2.3 From 03e92a5e097d679acbd1fb4d2ae238a38158aa0b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:09:32 -0800 Subject: ia64: remove custom __early_pfn_to_nid() The ia64 implementation of __early_pfn_to_nid() essentially relies on the same data as the generic implementation. The correspondence between memory ranges and nodes is set in memblock during early memory initialization in register_active_ranges() function. The initialization of sparsemem that requires early_pfn_to_nid() happens later and it can use the memblock information like the other architectures. Link: https://lkml.kernel.org/r/20201101170454.9567-3-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Alexey Dobriyan Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matt Turner Cc: Meelis Roos Cc: Michael Schmitz Cc: Russell King Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- include/linux/mmzone.h | 11 ----------- 2 files changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d188b8611dc..48fa3e71be1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2434,9 +2434,6 @@ static inline int early_pfn_to_nid(unsigned long pfn) #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); -/* there is a per-arch backend function. */ -extern int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cca2a4443c9c..16fb5522a74f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1428,17 +1428,6 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -/* - * During memory init memblocks map pfns to nids. The search is expensive and - * this caches recent lookups. The implementation of __early_pfn_to_nid - * may treat start/end as pfns or sections. - */ -struct mminit_pfnnid_cache { - unsigned long last_start; - unsigned long last_end; - int last_nid; -}; - /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we * need to check pfn validity within that MAX_ORDER_NR_PAGES block. -- cgit v1.2.3 From 5e545df3292fbd3d5963c68980f1527ead2a2b3f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:09:55 -0800 Subject: arm: remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL ARM is the only architecture that defines CONFIG_ARCH_HAS_HOLES_MEMORYMODEL which in turn enables memmap_valid_within() function that is intended to verify existence of struct page associated with a pfn when there are holes in the memory map. However, the ARCH_HAS_HOLES_MEMORYMODEL also enables HAVE_ARCH_PFN_VALID and arch-specific pfn_valid() implementation that also deals with the holes in the memory map. The only two users of memmap_valid_within() call this function after a call to pfn_valid() so the memmap_valid_within() check becomes redundant. Remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL and memmap_valid_within() and rely entirely on ARM's implementation of pfn_valid() that is now enabled unconditionally. Link: https://lkml.kernel.org/r/20201101170454.9567-9-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Alexey Dobriyan Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matt Turner Cc: Meelis Roos Cc: Michael Schmitz Cc: Russell King Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 16fb5522a74f..6e0025b5a88f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1440,37 +1440,6 @@ void sparse_init(void); #define pfn_valid_within(pfn) (1) #endif -#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -/* - * pfn_valid() is meant to be able to tell if a given PFN has valid memmap - * associated with it or not. This means that a struct page exists for this - * pfn. The caller cannot assume the page is fully initialized in general. - * Hotplugable pages might not have been onlined yet. pfn_to_online_page() - * will ensure the struct page is fully online and initialized. Special pages - * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly. - * - * In FLATMEM, it is expected that holes always have valid memmap as long as - * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed - * that a valid section has a memmap for the entire section. - * - * However, an ARM, and maybe other embedded architectures in the future - * free memmap backing holes to save memory on the assumption the memmap is - * never used. The page_zone linkages are then broken even though pfn_valid() - * returns true. A walker of the full memmap must then do this additional - * check to ensure the memmap they are looking at is sane by making sure - * the zone and PFN linkages are still valid. This is expensive, but walkers - * of the full memmap are extremely rare. - */ -bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone); -#else -static inline bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone) -{ - return true; -} -#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ -- cgit v1.2.3 From 77bc7fd607dee2ffb28daff6d0dd8ae42af61ea8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:20 -0800 Subject: mm: introduce debug_pagealloc_{map,unmap}_pages() helpers Patch series "arch, mm: improve robustness of direct map manipulation", v7. During recent discussion about KVM protected memory, David raised a concern about usage of __kernel_map_pages() outside of DEBUG_PAGEALLOC scope [1]. Indeed, for architectures that define CONFIG_ARCH_HAS_SET_DIRECT_MAP it is possible that __kernel_map_pages() would fail, but since this function is void, the failure will go unnoticed. Moreover, there's lack of consistency of __kernel_map_pages() semantics across architectures as some guard this function with #ifdef DEBUG_PAGEALLOC, some refuse to update the direct map if page allocation debugging is disabled at run time and some allow modifying the direct map regardless of DEBUG_PAGEALLOC settings. This set straightens this out by restoring dependency of __kernel_map_pages() on DEBUG_PAGEALLOC and updating the call sites accordingly. Since currently the only user of __kernel_map_pages() outside DEBUG_PAGEALLOC is hibernation, it is updated to make direct map accesses there more explicit. [1] https://lore.kernel.org/lkml/2759b4bf-e1e3-d006-7d86-78a40348269d@redhat.com This patch (of 4): When CONFIG_DEBUG_PAGEALLOC is enabled, it unmaps pages from the kernel direct mapping after free_pages(). The pages than need to be mapped back before they could be used. Theese mapping operations use __kernel_map_pages() guarded with with debug_pagealloc_enabled(). The only place that calls __kernel_map_pages() without checking whether DEBUG_PAGEALLOC is enabled is the hibernation code that presumes availability of this function when ARCH_HAS_SET_DIRECT_MAP is set. Still, on arm64, __kernel_map_pages() will bail out when DEBUG_PAGEALLOC is not enabled but set_direct_map_invalid_noflush() may render some pages not present in the direct map and hibernation code won't be able to save such pages. To make page allocation debugging and hibernation interaction more robust, the dependency on DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP has to be made more explicit. Start with combining the guard condition and the call to __kernel_map_pages() into debug_pagealloc_map_pages() and debug_pagealloc_unmap_pages() functions to emphasize that __kernel_map_pages() should not be called without DEBUG_PAGEALLOC and use these new functions to map/unmap pages when page allocation debugging is enabled. Link: https://lkml.kernel.org/r/20201109192128.960-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20201109192128.960-2-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: David Hildenbrand Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: "David S. Miller" Cc: Dave Hansen Cc: David Rientjes Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Heiko Carstens Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 48fa3e71be1a..0f4c34672a13 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2943,12 +2943,27 @@ kernel_map_pages(struct page *page, int numpages, int enable) { __kernel_map_pages(page, numpages, enable); } + +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 1); +} + +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 0); +} + #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ static inline void kernel_map_pages(struct page *page, int numpages, int enable) {} +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ -- cgit v1.2.3 From 2abf962a8d42b32f5ffeb827826290b799c85f86 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:25 -0800 Subject: PM: hibernate: make direct map manipulations more explicit When DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP is enabled a page may be not present in the direct map and has to be explicitly mapped before it could be copied. Introduce hibernate_map_page() and hibernation_unmap_page() that will explicitly use set_direct_map_{default,invalid}_noflush() for ARCH_HAS_SET_DIRECT_MAP case and debug_pagealloc_{map,unmap}_pages() for DEBUG_PAGEALLOC case. The remapping of the pages in safe_copy_page() presumes that it only changes protection bits in an existing PTE and so it is safe to ignore return value of set_direct_map_{default,invalid}_noflush(). Still, add a pr_warn() so that future changes in set_memory APIs will not silently break hibernation. Link: https://lkml.kernel.org/r/20201109192128.960-3-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Rafael J. Wysocki Reviewed-by: David Hildenbrand Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: David Rientjes Cc: "David S. Miller" Cc: "Edgecombe, Rick P" Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f4c34672a13..6b5df31387b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2934,16 +2934,6 @@ static inline bool debug_pagealloc_enabled_static(void) #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) extern void __kernel_map_pages(struct page *page, int numpages, int enable); -/* - * When called in DEBUG_PAGEALLOC context, the call should most likely be - * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() - */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) -{ - __kernel_map_pages(page, numpages, enable); -} - static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) @@ -2960,8 +2950,6 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) {} static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION -- cgit v1.2.3 From 5d6ad668f31625c6aa9ed8dc3bdb29561d2b1144 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:30 -0800 Subject: arch, mm: restore dependency of __kernel_map_pages() on DEBUG_PAGEALLOC The design of DEBUG_PAGEALLOC presumes that __kernel_map_pages() must never fail. With this assumption is wouldn't be safe to allow general usage of this function. Moreover, some architectures that implement __kernel_map_pages() have this function guarded by #ifdef DEBUG_PAGEALLOC and some refuse to map/unmap pages when page allocation debugging is disabled at runtime. As all the users of __kernel_map_pages() were converted to use debug_pagealloc_map_pages() it is safe to make it available only when DEBUG_PAGEALLOC is set. Link: https://lkml.kernel.org/r/20201109192128.960-4-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Acked-by: Kirill A. Shutemov Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: David Rientjes Cc: "David S. Miller" Cc: "Edgecombe, Rick P" Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b5df31387b5..7a67bdf3df5e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2931,7 +2931,11 @@ static inline bool debug_pagealloc_enabled_static(void) return static_branch_unlikely(&_debug_pagealloc_enabled); } -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) +#ifdef CONFIG_DEBUG_PAGEALLOC +/* + * To support DEBUG_PAGEALLOC architecture must ensure that + * __kernel_map_pages() never fails + */ extern void __kernel_map_pages(struct page *page, int numpages, int enable); static inline void debug_pagealloc_map_pages(struct page *page, int numpages) @@ -2949,13 +2953,13 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ -#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ +#else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ +#endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); -- cgit v1.2.3 From 32a0de886eb3cb7e6990da27a9cdfa50baa8be64 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:35 -0800 Subject: arch, mm: make kernel_page_present() always available For architectures that enable ARCH_HAS_SET_MEMORY having the ability to verify that a page is mapped in the kernel direct map can be useful regardless of hibernation. Add RISC-V implementation of kernel_page_present(), update its forward declarations and stubs to be a part of set_memory API and remove ugly ifdefery in inlcude/linux/mm.h around current declarations of kernel_page_present(). Link: https://lkml.kernel.org/r/20201109192128.960-5-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Kirill A. Shutemov Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: "David S. Miller" Cc: "Edgecombe, Rick P" Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 ------- include/linux/set_memory.h | 5 +++++ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7a67bdf3df5e..5ec79757eeae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2949,16 +2949,9 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } - -#ifdef CONFIG_HIBERNATION -extern bool kernel_page_present(struct page *page); -#endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} -#ifdef CONFIG_HIBERNATION -static inline bool kernel_page_present(struct page *page) { return true; } -#endif /* CONFIG_HIBERNATION */ #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 860e0f843c12..fe1aa4e54680 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -23,6 +23,11 @@ static inline int set_direct_map_default_noflush(struct page *page) { return 0; } + +static inline bool kernel_page_present(struct page *page) +{ + return true; +} #endif #ifndef set_mce_nospec -- cgit v1.2.3 From 952eaf815925f106eb6b68346b3458a68bb18ec1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:10:53 -0800 Subject: mm, page_alloc: cache pageset high and batch in struct zone All per-cpu pagesets for a zone use the same high and batch values, that are duplicated there just for performance (locality) reasons. This patch adds the same variables also to struct zone as a shared copy. This will be useful later for making possible to disable pcplists temporarily by setting high value to 0, while remembering the values for restoring them later. But we can also immediately benefit from not updating pagesets of all possible cpus in case the newly recalculated values (after sysctl change or memory online/offline) are actually unchanged from the previous ones. Link: https://lkml.kernel.org/r/20201111092812.11329-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6e0025b5a88f..4eee08b0062b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -470,6 +470,12 @@ struct zone { #endif struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; + /* + * the high and batch values are copied to individual pagesets for + * faster access + */ + int pageset_high; + int pageset_batch; #ifndef CONFIG_SPARSEMEM /* -- cgit v1.2.3 From 2ee08717da50160c20056f6d6b76afdf65db33ab Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 14 Dec 2020 19:11:02 -0800 Subject: include/linux/page-flags.h: remove unused __[Set|Clear]PagePrivate They are not used anymore. Link: https://lkml.kernel.org/r/20201009135914.64826-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4f6ba9379112..50cbf5e931bc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -363,8 +363,7 @@ PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) * for its own purposes. * - PG_private and PG_private_2 cause releasepage() and co to be invoked */ -PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY) - __CLEARPAGEFLAG(Private, private, PF_ANY) +PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) -- cgit v1.2.3 From 3b12da6d1d4adff087939c071e0d74a7857439a0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Dec 2020 19:11:05 -0800 Subject: mm/page-flags: fix comment We haven't had 'dontuse' flags since 2002. Replace this obsolete warning with a hopefully more useful one. Link: https://lkml.kernel.org/r/20201027025823.3704-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 50cbf5e931bc..c1368af622c7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -86,8 +86,7 @@ */ /* - * Don't use the *_dontuse flags. Use the macros. Otherwise you'll break - * locked- and dirty-page accounting. + * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which -- cgit v1.2.3 From ebfe1b8f6ea5d83d8c1aa18ddd8ede432a7414e7 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 14 Dec 2020 19:11:58 -0800 Subject: include/linux/huge_mm.h: remove extern keyword The external function definitions don't need the "extern" keyword. Remove them so future changes don't copy the function definition style. Link: https://lkml.kernel.org/r/20201106235135.32109-1-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 93 ++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0365aa97f8e7..6a19f35f836b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -7,43 +7,37 @@ #include /* only for vma_is_dax() */ -extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); -extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, - struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); -extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, unsigned long addr, - struct vm_area_struct *vma); +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); +void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif -extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); -extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags); -extern bool madvise_free_huge_pmd(struct mmu_gather *tlb, - struct vm_area_struct *vma, - pmd_t *pmd, unsigned long addr, unsigned long next); -extern int zap_huge_pmd(struct mmu_gather *tlb, - struct vm_area_struct *vma, - pmd_t *pmd, unsigned long addr); -extern int zap_huge_pud(struct mmu_gather *tlb, - struct vm_area_struct *vma, - pud_t *pud, unsigned long addr); -extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, - pmd_t *old_pmd, pmd_t *new_pmd); -extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, - unsigned long cp_flags); +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd, + unsigned int flags); +bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next); +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr); +int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, + unsigned long addr); +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); @@ -100,13 +94,13 @@ enum transparent_hugepage_flag { struct kobject; struct kobj_attribute; -extern ssize_t single_hugepage_flag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count, - enum transparent_hugepage_flag flag); -extern ssize_t single_hugepage_flag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf, - enum transparent_hugepage_flag flag); +ssize_t single_hugepage_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag); +ssize_t single_hugepage_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) @@ -179,12 +173,11 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, (transparent_hugepage_flags & \ (1< Date: Mon, 14 Dec 2020 19:12:46 -0800 Subject: mm/compaction: make defer_compaction and compaction_deferred static defer_compaction() and compaction_deferred() and compaction_restarting() in mm/compaction.c won't be used in other files, so make them static, and remove the declaration in the header file. Take the chance to fix a typo. Link: https://lkml.kernel.org/r/20201123170801.GA9625@rlk Signed-off-by: Hui Su Acked-by: Vlastimil Babka Cc: Nitin Gupta Cc: Baoquan He Cc: Mateusz Nosek Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 1de5a1151ee7..ed4070ed41ef 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -98,11 +98,8 @@ extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int highest_zoneidx); -extern void defer_compaction(struct zone *zone, int order); -extern bool compaction_deferred(struct zone *zone, int order); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); -extern bool compaction_restarting(struct zone *zone, int order); /* Compaction has made some progress and retrying makes sense */ static inline bool compaction_made_progress(enum compact_result result) @@ -194,15 +191,6 @@ static inline enum compact_result compaction_suitable(struct zone *zone, int ord return COMPACT_SKIPPED; } -static inline void defer_compaction(struct zone *zone, int order) -{ -} - -static inline bool compaction_deferred(struct zone *zone, int order) -{ - return true; -} - static inline bool compaction_made_progress(enum compact_result result) { return false; -- cgit v1.2.3 From 0060ef3b4e6dd1410da164d48a595eadb2fb02f7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Dec 2020 19:12:59 -0800 Subject: mm: support THPs in zero_user_segments We can only kmap() one subpage of a THP at a time, so loop over all relevant subpages, skipping ones which don't need to be zeroed. This is too large to inline when THPs are enabled and we actually need highmem, so put it in highmem.c. [willy@infradead.org: start1 was allowed to be less than start2] Link: https://lkml.kernel.org/r/20201124041507.28996-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Jan Kara Cc: Michal Hocko Cc: Zi Yan Cc: Song Liu Cc: Mel Gorman Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 14e6202ce47f..8e21fe82b3a3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -284,13 +284,22 @@ static inline void clear_highpage(struct page *page) kunmap_atomic(kaddr); } +/* + * If we pass in a base or tail page, we can zero up to PAGE_SIZE. + * If we pass in a head page, we can zero up to the size of the compound page. + */ +#if defined(CONFIG_HIGHMEM) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +void zero_user_segments(struct page *page, unsigned start1, unsigned end1, + unsigned start2, unsigned end2); +#else /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */ static inline void zero_user_segments(struct page *page, - unsigned start1, unsigned end1, - unsigned start2, unsigned end2) + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) { void *kaddr = kmap_atomic(page); + unsigned int i; - BUG_ON(end1 > PAGE_SIZE || end2 > PAGE_SIZE); + BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); @@ -299,8 +308,10 @@ static inline void zero_user_segments(struct page *page, memset(kaddr + start2, 0, end2 - start2); kunmap_atomic(kaddr); - flush_dcache_page(page); + for (i = 0; i < compound_nr(page); i++) + flush_dcache_page(page + i); } +#endif /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */ static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) -- cgit v1.2.3 From 236c32eb109696590b7428957eda50cc05e22af8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 14 Dec 2020 19:13:13 -0800 Subject: mm: migrate: clean up migrate_prep{_local} The migrate_prep{_local} never fails, so it is pointless to have return value and check the return value. Link: https://lkml.kernel.org/r/20201113205359.556831-5-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zi Yan Cc: Jan Kara Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 0f8d1583fa8e..4594838a0f7c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -45,8 +45,8 @@ extern struct page *alloc_migration_target(struct page *page, unsigned long priv extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); -extern int migrate_prep(void); -extern int migrate_prep_local(void); +extern void migrate_prep(void); +extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, -- cgit v1.2.3 From 04013513cc84c401c7de9023ff3eda7863fc4add Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:30 -0800 Subject: mm, page_alloc: do not rely on the order of page_poison and init_on_alloc/free parameters Patch series "cleanup page poisoning", v3. I have identified a number of issues and opportunities for cleanup with CONFIG_PAGE_POISON and friends: - interaction with init_on_alloc and init_on_free parameters depends on the order of parameters (Patch 1) - the boot time enabling uses static key, but inefficienty (Patch 2) - sanity checking is incompatible with hibernation (Patch 3) - CONFIG_PAGE_POISONING_NO_SANITY can be removed now that we have init_on_free (Patch 4) - CONFIG_PAGE_POISONING_ZERO can be most likely removed now that we have init_on_free (Patch 5) This patch (of 5): Enabling page_poison=1 together with init_on_alloc=1 or init_on_free=1 produces a warning in dmesg that page_poison takes precedence. However, as these warnings are printed in early_param handlers for init_on_alloc/free, they are not printed if page_poison is enabled later on the command line (handlers are called in the order of their parameters), or when init_on_alloc/free is always enabled by the respective config option - before the page_poison early param handler is called, it is not considered to be enabled. This is inconsistent. We can remove the dependency on order by making the init_on_* parameters only set a boolean variable, and postponing the evaluation after all early params have been processed. Introduce a new init_mem_debugging_and_hardening() function for that, and move the related debug_pagealloc processing there as well. As a result init_mem_debugging_and_hardening() knows always accurately if init_on_* and/or page_poison options were enabled. Thus we can also optimize want_init_on_alloc() and want_init_on_free(). We don't need to check page_poisoning_enabled() there, we can instead not enable the init_on_* static keys at all, if page poisoning is enabled. This results in a simpler and more effective code. Link: https://lkml.kernel.org/r/20201113104033.22907-1-vbabka@suse.cz Link: https://lkml.kernel.org/r/20201113104033.22907-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport Cc: Rafael J. Wysocki Cc: Alexander Potapenko Cc: Kees Cook Cc: Michal Hocko Cc: Mateusz Nosek Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5ec79757eeae..591c784277ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2872,6 +2872,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +extern void init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); @@ -2881,35 +2882,20 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_alloc); -#else DECLARE_STATIC_KEY_FALSE(init_on_alloc); -#endif static inline bool want_init_on_alloc(gfp_t flags) { - if (static_branch_unlikely(&init_on_alloc) && - !page_poisoning_enabled()) + if (static_branch_unlikely(&init_on_alloc)) return true; return flags & __GFP_ZERO; } -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_free); -#else DECLARE_STATIC_KEY_FALSE(init_on_free); -#endif static inline bool want_init_on_free(void) { - return static_branch_unlikely(&init_on_free) && - !page_poisoning_enabled(); + return static_branch_unlikely(&init_on_free); } -#ifdef CONFIG_DEBUG_PAGEALLOC -extern void init_debug_pagealloc(void); -#else -static inline void init_debug_pagealloc(void) {} -#endif extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); -- cgit v1.2.3 From 8db26a3d47354ce7271a8cab03cd65b9d3d610b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:34 -0800 Subject: mm, page_poison: use static key more efficiently Commit 11c9c7edae06 ("mm/page_poison.c: replace bool variable with static key") changed page_poisoning_enabled() to a static key check. However, the function is not inlined, so each check still involves a function call with overhead not eliminated when page poisoning is disabled. Analogically to how debug_pagealloc is handled, this patch converts page_poisoning_enabled() back to boolean check, and introduces page_poisoning_enabled_static() for fast paths. Both functions are inlined. The function kernel_poison_pages() is also called unconditionally and does the static key check inside. Remove it from there and put it to callers. Also split it to two functions kernel_poison_pages() and kernel_unpoison_pages() instead of the confusing bool parameter. Also optimize the check that enables page poisoning instead of debug_pagealloc for architectures without proper debug_pagealloc support. Move the check to init_mem_debugging_and_hardening() to enable a single static key instead of having two static branches in page_poisoning_enabled_static(). Link: https://lkml.kernel.org/r/20201113104033.22907-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Mike Rapoport Cc: Rafael J. Wysocki Cc: Alexander Potapenko Cc: Kees Cook Cc: Laura Abbott Cc: Mateusz Nosek Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 591c784277ea..026707a58159 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2874,12 +2874,37 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, extern void init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING -extern bool page_poisoning_enabled(void); -extern void kernel_poison_pages(struct page *page, int numpages, int enable); +extern void __kernel_poison_pages(struct page *page, int numpages); +extern void __kernel_unpoison_pages(struct page *page, int numpages); +extern bool _page_poisoning_enabled_early; +DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); +static inline bool page_poisoning_enabled(void) +{ + return _page_poisoning_enabled_early; +} +/* + * For use in fast paths after init_mem_debugging() has run, or when a + * false negative result is not harmful when called too early. + */ +static inline bool page_poisoning_enabled_static(void) +{ + return static_branch_unlikely(&_page_poisoning_enabled); +} +static inline void kernel_poison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, numpages); +} +static inline void kernel_unpoison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_unpoison_pages(page, numpages); +} #else static inline bool page_poisoning_enabled(void) { return false; } -static inline void kernel_poison_pages(struct page *page, int numpages, - int enable) { } +static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void kernel_poison_pages(struct page *page, int numpages) { } +static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif DECLARE_STATIC_KEY_FALSE(init_on_alloc); -- cgit v1.2.3 From 03b6c9a3e8805606c0bb4ad41855fac3bf85c3b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:38 -0800 Subject: kernel/power: allow hibernation with page_poison sanity checking Page poisoning used to be incompatible with hibernation, as the state of poisoned pages was lost after resume, thus enabling CONFIG_HIBERNATION forces CONFIG_PAGE_POISONING_NO_SANITY. For the same reason, the poisoning with zeroes variant CONFIG_PAGE_POISONING_ZERO used to disable hibernation. The latter restriction was removed by commit 1ad1410f632d ("PM / Hibernate: allow hibernation with PAGE_POISONING_ZERO") and similarly for init_on_free by commit 18451f9f9e58 ("PM: hibernate: fix crashes with init_on_free=1") by making sure free pages are cleared after resume. We can use the same mechanism to instead poison free pages with PAGE_POISON after resume. This covers both zero and 0xAA patterns. Thus we can remove the Kconfig restriction that disables page poison sanity checking when hibernation is enabled. Link: https://lkml.kernel.org/r/20201113104033.22907-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Rafael J. Wysocki [hibernation] Reviewed-by: David Hildenbrand Cc: Mike Rapoport Cc: Alexander Potapenko Cc: Kees Cook Cc: Laura Abbott Cc: Mateusz Nosek Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 026707a58159..3e1fe8ca9720 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2903,6 +2903,7 @@ static inline void kernel_unpoison_pages(struct page *page, int numpages) #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif -- cgit v1.2.3 From f289041ed4cf9a3f6e8a32068fef9ffb2acc5662 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:45 -0800 Subject: mm, page_poison: remove CONFIG_PAGE_POISONING_ZERO CONFIG_PAGE_POISONING_ZERO uses the zero pattern instead of 0xAA. It was introduced by commit 1414c7f4f7d7 ("mm/page_poisoning.c: allow for zero poisoning"), noting that using zeroes retains the benefit of sanitizing content of freed pages, with the benefit of not having to zero them again on alloc, and the downside of making some forms of corruption (stray writes of NULLs) harder to detect than with the 0xAA pattern. Together with CONFIG_PAGE_POISONING_NO_SANITY it made possible to sanitize the contents on free without checking it back on alloc. These days we have the init_on_free() option to achieve sanitization with zeroes and to save clearing on alloc (and without checking on alloc). Arguably if someone does choose to check the poison for corruption on alloc, the savings of not clearing the page are secondary, and it makes sense to always use the 0xAA poison pattern. Thus, remove the CONFIG_PAGE_POISONING_ZERO option for being redundant. Link: https://lkml.kernel.org/r/20201113104033.22907-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Mike Rapoport Cc: Rafael J. Wysocki Cc: Alexander Potapenko Cc: Kees Cook Cc: Laura Abbott Cc: Mateusz Nosek Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/poison.h b/include/linux/poison.h index dc8ae5d8db03..aff1c9250c82 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -27,11 +27,7 @@ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) /********** mm/page_poison.c **********/ -#ifdef CONFIG_PAGE_POISONING_ZERO -#define PAGE_POISON 0x00 -#else #define PAGE_POISON 0xaa -#endif /********** mm/page_alloc.c ************/ -- cgit v1.2.3 From 37cd0575b8510159992d279c530c05f872990b02 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Mon, 14 Dec 2020 19:13:49 -0800 Subject: userfaultfd: add UFFD_USER_MODE_ONLY Patch series "Control over userfaultfd kernel-fault handling", v6. This patch series is split from [1]. The other series enables SELinux support for userfaultfd file descriptors so that its creation and movement can be controlled. It has been demonstrated on various occasions that suspending kernel code execution for an arbitrary amount of time at any access to userspace memory (copy_from_user()/copy_to_user()/...) can be exploited to change the intended behavior of the kernel. For instance, handling page faults in kernel-mode using userfaultfd has been exploited in [2, 3]. Likewise, FUSE, which is similar to userfaultfd in this respect, has been exploited in [4, 5] for similar outcome. This small patch series adds a new flag to userfaultfd(2) that allows callers to give up the ability to handle kernel-mode faults with the resulting UFFD file object. It then adds a 'user-mode only' option to the unprivileged_userfaultfd sysctl knob to require unprivileged callers to use this new flag. The purpose of this new interface is to decrease the chance of an unprivileged userfaultfd user taking advantage of userfaultfd to enhance security vulnerabilities by lengthening the race window in kernel code. [1] https://lore.kernel.org/lkml/20200211225547.235083-1-dancol@google.com/ [2] https://duasynt.com/blog/linux-kernel-heap-spray [3] https://duasynt.com/blog/cve-2016-6187-heap-off-by-one-exploit [4] https://googleprojectzero.blogspot.com/2016/06/exploiting-recursion-in-linux-kernel_20.html [5] https://bugs.chromium.org/p/project-zero/issues/detail?id=808 This patch (of 2): userfaultfd handles page faults from both user and kernel code. Add a new UFFD_USER_MODE_ONLY flag for userfaultfd(2) that makes the resulting userfaultfd object refuse to handle faults from kernel mode, treating these faults as if SIGBUS were always raised, causing the kernel code to fail with EFAULT. A future patch adds a knob allowing administrators to give some processes the ability to create userfaultfd file objects only if they pass UFFD_USER_MODE_ONLY, reducing the likelihood that these processes will exploit userfaultfd's ability to delay kernel page faults to open timing windows for future exploits. Link: https://lkml.kernel.org/r/20201120030411.2690816-1-lokeshgidra@google.com Link: https://lkml.kernel.org/r/20201120030411.2690816-2-lokeshgidra@google.com Signed-off-by: Daniel Colascione Signed-off-by: Lokesh Gidra Reviewed-by: Andrea Arcangeli Cc: Alexander Viro Cc: Cc: Daniel Colascione Cc: Eric Biggers Cc: Iurii Zaikin Cc: Jeff Vander Stoep Cc: Jerome Glisse Cc: "Joel Fernandes (Google)" Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kalesh Singh Cc: Kees Cook Cc: Luis Chamberlain Cc: Mauro Carvalho Chehab Cc: Mel Gorman Cc: Mike Rapoport Cc: Nitin Gupta Cc: Peter Xu Cc: Sebastian Andrzej Siewior Cc: Shaohua Li Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index e7e98bde221f..5f2d88212f7c 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -257,4 +257,13 @@ struct uffdio_writeprotect { __u64 mode; }; +/* + * Flags for the userfaultfd(2) system call itself. + */ + +/* + * Create a userfaultfd that can handle page faults only in user mode. + */ +#define UFFD_USER_MODE_ONLY 1 + #endif /* _LINUX_USERFAULTFD_H */ -- cgit v1.2.3