From 9bd3bb6703d8c0a5fb8aec8e3287bd55b7341dcd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 11 Jul 2019 20:52:08 -0700 Subject: mm/nvdimm: add is_ioremap_addr and use that to check ioremap address Architectures like powerpc use different address ranges for the ioremap and vmalloc mappings. The memunmap() check used by the nvdimm layer was wrongly using is_vmalloc_addr() to check for the ioremap range, which fails for ppc64. This results in ppc64 not freeing the ioremap mapping. The side effect of this is an unbind failure during module unload with the papr_scm nvdimm driver. Link: http://lkml.kernel.org/r/20190701134038.14165-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions") Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dd0b5f4e1e45..0a6dae2f2b84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -633,6 +633,11 @@ static inline bool is_vmalloc_addr(const void *x) return false; #endif } + +#ifndef is_ioremap_addr +#define is_ioremap_addr(x) is_vmalloc_addr(x) +#endif + #ifdef CONFIG_MMU extern int is_vmalloc_or_module_addr(const void *x); #else -- cgit v1.2.3 From a760f8a67cb38d19fd52f2a28c65c967e469367e Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 11 Jul 2019 20:52:24 -0700 Subject: include/linux/dmar.h: replace single-char identifiers in macros A few macros in the IOMMU code use single-char identifiers, which makes the code hard to read and debug. Replace them with meaningful names. Link: http://lkml.kernel.org/r/1559566783-13627-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Suggested-by: Andrew Morton Cc: Joerg Roedel Cc: Robin Murphy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dmar.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 28813c6f44b6..a7cf3599d9a1 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -92,12 +92,14 @@ static inline bool dmar_rcu_check(void) #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) -#define for_each_dev_scope(a, c, p, d) \ - for ((p) = 0; ((d) = (p) < (c) ? dmar_rcu_dereference((a)[(p)].dev) : \ - NULL, (p) < (c)); (p)++) - -#define for_each_active_dev_scope(a, c, p, d) \ - for_each_dev_scope((a), (c), (p), (d)) if (!(d)) { continue; } else +#define for_each_dev_scope(devs, cnt, i, tmp) \ + for ((i) = 0; ((tmp) = (i) < (cnt) ? \ + dmar_rcu_dereference((devs)[(i)].dev) : NULL, (i) < (cnt)); \ + (i)++) + +#define for_each_active_dev_scope(devs, cnt, i, tmp) \ + for_each_dev_scope((devs), (cnt), (i), (tmp)) \ + if (!(tmp)) { continue; } else extern int dmar_table_init(void); extern int dmar_dev_scope_init(void); -- cgit v1.2.3 From 7d8ad890dad00f6cd64bfb44d9be4fceb10cf819 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:03 -0700 Subject: mm/kasan: introduce __kasan_check_{read,write} Patch series "mm/kasan: Add object validation in ksize()", v3. This patch (of 5): This introduces __kasan_check_{read,write}. The __kasan_check functions may be used from anywhere, even compilation units that disable instrumentation selectively. This change eliminates the need for the __KASAN_INTERNAL definition.
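To see the intent of the split, here is a minimal userspace model (illustrative only; it reuses the kernel names but is not the kernel implementation): the double-underscored variant is always compiled in once the feature is enabled, while the plain variant tracks the per-file instrumentation flag.

#include <stdio.h>

/* Always available once the feature is built in, even in files that
 * opt out of instrumentation (userspace stand-in for __kasan_check_read). */
static void __kasan_check_read(const volatile void *p, unsigned int size)
{
        printf("explicit check: read of %u bytes at %p\n", size, (void *)p);
}

/* Only real in instrumented translation units; a no-op elsewhere. */
#ifdef __SANITIZE_ADDRESS__
#define kasan_check_read __kasan_check_read
#else
static void kasan_check_read(const volatile void *p, unsigned int size) { }
#endif

int main(void)
{
        int x = 42;

        __kasan_check_read(&x, sizeof(x));      /* always active */
        kasan_check_read(&x, sizeof(x));        /* active only if instrumented */
        return 0;
}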
[elver@google.com: v5] Link: http://lkml.kernel.org/r/20190708170706.174189-2-elver@google.com Link: http://lkml.kernel.org/r/20190626142014.141844-2-elver@google.com Signed-off-by: Marco Elver Acked-by: Mark Rutland Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index a61dc075e2ce..221f05fbddd7 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,9 +2,28 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H -#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) -void kasan_check_read(const volatile void *p, unsigned int size); -void kasan_check_write(const volatile void *p, unsigned int size); +/* + * __kasan_check_*: Always available when KASAN is enabled. This may be used + * even in compilation units that selectively disable KASAN, but must use KASAN + * to validate access to an address. Never use these in header files! + */ +#ifdef CONFIG_KASAN +void __kasan_check_read(const volatile void *p, unsigned int size); +void __kasan_check_write(const volatile void *p, unsigned int size); +#else +static inline void __kasan_check_read(const volatile void *p, unsigned int size) +{ } +static inline void __kasan_check_write(const volatile void *p, unsigned int size) +{ } +#endif + +/* + * kasan_check_*: Only available when the particular compilation unit has KASAN + * instrumentation enabled. May be used in header files. + */ +#ifdef __SANITIZE_ADDRESS__ +#define kasan_check_read __kasan_check_read +#define kasan_check_write __kasan_check_write #else static inline void kasan_check_read(const volatile void *p, unsigned int size) { } -- cgit v1.2.3 From b5f6e0fc7d60e0234dac82498e90dfe9027bad1f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:07 -0700 Subject: mm/kasan: change kasan_check_{read,write} to return boolean This changes {,__}kasan_check_{read,write} functions to return a boolean denoting if the access was valid or not. [sfr@canb.auug.org.au: include types.h for "bool"] Link: http://lkml.kernel.org/r/20190705184949.13cdd021@canb.auug.org.au Link: http://lkml.kernel.org/r/20190626142014.141844-3-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Stephen Rothwell Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index 221f05fbddd7..ac6aba632f2d 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,19 +2,25 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H +#include + /* * __kasan_check_*: Always available when KASAN is enabled. This may be used * even in compilation units that selectively disable KASAN, but must use KASAN * to validate access to an address. Never use these in header files! 
*/ #ifdef CONFIG_KASAN -void __kasan_check_read(const volatile void *p, unsigned int size); -void __kasan_check_write(const volatile void *p, unsigned int size); +bool __kasan_check_read(const volatile void *p, unsigned int size); +bool __kasan_check_write(const volatile void *p, unsigned int size); #else -static inline void __kasan_check_read(const volatile void *p, unsigned int size) -{ } -static inline void __kasan_check_write(const volatile void *p, unsigned int size) -{ } +static inline bool __kasan_check_read(const volatile void *p, unsigned int size) +{ + return true; +} +static inline bool __kasan_check_write(const volatile void *p, unsigned int size) +{ + return true; +} #endif /* @@ -25,10 +31,14 @@ static inline void __kasan_check_write(const volatile void *p, unsigned int size #define kasan_check_read __kasan_check_read #define kasan_check_write __kasan_check_write #else -static inline void kasan_check_read(const volatile void *p, unsigned int size) -{ } -static inline void kasan_check_write(const volatile void *p, unsigned int size) -{ } +static inline bool kasan_check_read(const volatile void *p, unsigned int size) +{ + return true; +} +static inline bool kasan_check_write(const volatile void *p, unsigned int size) +{ + return true; +} #endif #endif -- cgit v1.2.3 From 10d1f8cb3965a6f633bf23eb984cda552927e3a5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:14 -0700 Subject: mm/slab: refactor common ksize KASAN logic into slab_common.c This refactors common code of ksize() between the various allocators into slab_common.c: __ksize() is the allocator-specific implementation without instrumentation, whereas ksize() includes the required KASAN logic. Link: http://lkml.kernel.org/r/20190626142014.141844-5-elver@google.com Signed-off-by: Marco Elver Acked-by: Christoph Lameter Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9449b19c5f10..98c3d12b7275 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,6 +184,7 @@ void * __must_check __krealloc(const void *, size_t, gfp_t); void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); void kzfree(const void *); +size_t __ksize(const void *); size_t ksize(const void *); #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR -- cgit v1.2.3 From 0d4ca4c9bab397b525c9a4f875d31410ce4bc738 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:18 -0700 Subject: mm/kasan: add object validation in ksize() ksize() has been unconditionally unpoisoning the whole shadow memory region associated with an allocation. This can lead to various undetected bugs, for example, double-kzfree(). Specifically, kzfree() uses ksize() to determine the actual allocation size, and subsequently zeroes the memory. Since ksize() used to just unpoison the whole shadow memory region, no invalid free was detected. This patch addresses this as follows: 1. Add a check in ksize(), and only then unpoison the memory region. 2. Preserve kasan_unpoison_slab() semantics by explicitly unpoisoning the shadow memory region using the size obtained from __ksize(). Tested: 1. 
With SLAB allocator: a) normal boot without warnings; b) verified the added double-kzfree() is detected. 2. With SLUB allocator: a) normal boot without warnings; b) verified the added double-kzfree() is detected. [elver@google.com: s/BUG_ON/WARN_ON_ONCE/, per Kees] Link: http://lkml.kernel.org/r/20190627094445.216365-6-elver@google.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199359 Link: http://lkml.kernel.org/r/20190626142014.141844-6-elver@google.com Signed-off-by: Marco Elver Acked-by: Kees Cook Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b40ea104dd36..cc8a03cc9674 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -76,8 +76,11 @@ void kasan_free_shadow(const struct vm_struct *vm); int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); -size_t ksize(const void *); -static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } +size_t __ksize(const void *); +static inline void kasan_unpoison_slab(const void *ptr) +{ + kasan_unpoison_shadow(ptr, __ksize(ptr)); +} size_t kasan_metadata_size(struct kmem_cache *cache); bool kasan_save_enable_multi_shot(void); -- cgit v1.2.3 From 2236b99d6a33df72befa7205c2d8381aca7ae701 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Jul 2019 20:54:21 -0700 Subject: include/linux/pfn_t.h: remove pfn_t_to_virt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has no callers and there is no virt_to_pfn_t(). Reported-by: Anshuman Khandual Cc: Dan Williams Cc: Jérôme Glisse Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 3c202a11a79e..01e8037023f7 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -66,13 +66,6 @@ static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) return PFN_PHYS(pfn_t_to_pfn(pfn)); } -static inline void *pfn_t_to_virt(pfn_t pfn) -{ - if (pfn_t_has_page(pfn) && !is_device_private_page(pfn_t_to_page(pfn))) - return __va(pfn_t_to_phys(pfn)); - return NULL; -} - static inline pfn_t page_to_pfn_t(struct page *page) { return pfn_to_pfn_t(page_to_pfn(page)); -- cgit v1.2.3 From 442a5a9a9295bfd9b0cffd0691ef8a6ce81db7c4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jul 2019 20:54:40 -0700 Subject: mm: make !CONFIG_HUGE_PAGE wrappers into static inlines Instead of using defines, which loses type safety and provokes unused variable warnings from gcc, put the constants into static inlines. 
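The problem with the old stubs is easy to reproduce in userspace. In the sketch below (hypothetical names, not the kernel code), compiling with -Wall warns about the "unused" variable only in the macro version, and only the inline version type-checks its argument:

#include <stddef.h>
#include <stdio.h>

struct hstate;

/* Macro stub: the argument vanishes at preprocessing time. */
#define huge_page_size_macro(h) ((unsigned long)4096)

/* Inline stub: the argument is type-checked and counts as a use. */
static inline unsigned long huge_page_size_inline(struct hstate *h)
{
        return 4096;
}

static unsigned long with_macro(void)
{
        struct hstate *h = NULL;        /* -Wunused-variable: h is expanded away */
        return huge_page_size_macro(h);
}

static unsigned long with_inline(void)
{
        struct hstate *h = NULL;        /* no warning: h is genuinely used */
        return huge_page_size_inline(h);
}

int main(void)
{
        printf("%lu %lu\n", with_macro(), with_inline());
        return 0;
}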
Link: http://lkml.kernel.org/r/20190522235102.GA15370@mellanox.com Signed-off-by: Jason Gunthorpe Suggested-by: Andrew Morton Reviewed-by: Mike Kravetz Cc: Jerome Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 102 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 86 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edf476c8cfb9..f895a79c6f5c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -608,22 +608,92 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; -#define alloc_huge_page(v, a, r) NULL -#define alloc_huge_page_node(h, nid) NULL -#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_vma(h, vma, address) NULL -#define alloc_bootmem_huge_page(h) NULL -#define hstate_file(f) NULL -#define hstate_sizelog(s) NULL -#define hstate_vma(v) NULL -#define hstate_inode(i) NULL -#define page_hstate(page) NULL -#define huge_page_size(h) PAGE_SIZE -#define huge_page_mask(h) PAGE_MASK -#define vma_kernel_pagesize(v) PAGE_SIZE -#define vma_mmu_pagesize(v) PAGE_SIZE -#define huge_page_order(h) 0 -#define huge_page_shift(h) PAGE_SHIFT + +static inline struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, + int avoid_reserve) +{ + return NULL; +} + +static inline struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + return NULL; +} + +static inline struct page * +alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask) +{ + return NULL; +} + +static inline struct page *alloc_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address) +{ + return NULL; +} + +static inline int __alloc_bootmem_huge_page(struct hstate *h) +{ + return 0; +} + +static inline struct hstate *hstate_file(struct file *f) +{ + return NULL; +} + +static inline struct hstate *hstate_sizelog(int page_size_log) +{ + return NULL; +} + +static inline struct hstate *hstate_vma(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline struct hstate *hstate_inode(struct inode *i) +{ + return NULL; +} + +static inline struct hstate *page_hstate(struct page *page) +{ + return NULL; +} + +static inline unsigned long huge_page_size(struct hstate *h) +{ + return PAGE_SIZE; +} + +static inline unsigned long huge_page_mask(struct hstate *h) +{ + return PAGE_MASK; +} + +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + return PAGE_SIZE; +} + +static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return PAGE_SIZE; +} + +static inline unsigned int huge_page_order(struct hstate *h) +{ + return 0; +} + +static inline unsigned int huge_page_shift(struct hstate *h) +{ + return PAGE_SHIFT; +} + static inline bool hstate_is_gigantic(struct hstate *h) { return false; -- cgit v1.2.3 From 219f8a2e25f0abbe222b170a0de2fd38c22d43ad Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 11 Jul 2019 20:54:43 -0700 Subject: include/linux/mm_types.h: ifdef struct vm_area_struct::swap_readahead_info The field is only used in swap code. 
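As a toy illustration of the mechanism (not the real struct, and with a plain long standing in for atomic_long_t), a conditionally compiled member costs nothing when the feature is compiled out:

#include <stdio.h>

struct vma_like {
        void *vm_private_data;
#ifdef CONFIG_SWAP
        long swap_readahead_info;       /* present only in swap-enabled builds */
#endif
};

int main(void)
{
        /* 16 bytes with -DCONFIG_SWAP on LP64, 8 bytes without */
        printf("sizeof(struct vma_like) = %zu\n", sizeof(struct vma_like));
        return 0;
}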
Link: http://lkml.kernel.org/r/20190503190500.GA30589@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8ec38b11b361..1d1093474c1a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -329,7 +329,9 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ +#ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; +#endif #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif -- cgit v1.2.3 From 1fcf0a561cd09d7fb7f7afa2ddfe05f72f32050e Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Thu, 11 Jul 2019 20:54:49 -0700 Subject: mm/page_isolation.c: change the prototype of undo_isolate_page_range() undo_isolate_page_range() never fails, so no need to return value. Link: http://lkml.kernel.org/r/1562075604-8979-1-git-send-email-kernelfans@gmail.com Signed-off-by: Pingfan Liu Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 280ae96dc4c3..1099c2fee20f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. * target range is [start_pfn, end_pfn) */ -int +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype); -- cgit v1.2.3 From 51b176290496518d6701bc40e63f70e4b6870198 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 11 Jul 2019 20:54:52 -0700 Subject: include/linux/vmpressure.h: use spinlock_t instead of struct spinlock For spinlocks the type spinlock_t should be used instead of "struct spinlock". Use spinlock_t for spinlock's definition. Link: http://lkml.kernel.org/r/20190704153803.12739-3-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 61e6fddfb26f..6d28bc433c1c 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -17,7 +17,7 @@ struct vmpressure { unsigned long tree_scanned; unsigned long tree_reclaimed; /* The lock is used to keep the scanned/reclaimed above in sync. */ - struct spinlock sr_lock; + spinlock_t sr_lock; /* The list of vmpressure_event structs. 
*/ struct list_head events; -- cgit v1.2.3 From f445884562dd8bc51eb4136bd21f014403d1813d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Jul 2019 20:54:59 -0700 Subject: include/linux/pagemap.h: document trylock_page() return value Cc: Henry Burns Cc: Jonathan Adams Cc: David Rientjes Cc: Mike Rapoport Cc: Vitaly Wool Cc: Xidong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fe0b29bf2df7..6fd0d3aa492c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -452,6 +452,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); +/* + * Return true if the page was successfully locked + */ static inline int trylock_page(struct page *page) { page = compound_head(page); -- cgit v1.2.3 From 96a2b03f281d3a3b29c27028164f43090d6495b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:06 -0700 Subject: mm, debug_pagelloc: use static keys to enable debugging Patch series "debug_pagealloc improvements". I have been recently debugging some pcplist corruptions, where it would be useful to perform struct page checks immediately as pages are allocated from and freed to pcplists, which is now only possible by rebuilding the kernel with CONFIG_DEBUG_VM (details in Patch 2 changelog). To make this kind of debugging simpler in future on a distro kernel, I have improved CONFIG_DEBUG_PAGEALLOC so that it has even smaller overhead when not enabled at boot time (Patch 1) and also when enabled (Patch 3), and extended it to perform the struct page checks more often when enabled (Patch 2). Now it can be configured in when building a distro kernel without extra overhead, and debugging page use after free or double free can be enabled simply by rebooting with debug_pagealloc=on. This patch (of 3): CONFIG_DEBUG_PAGEALLOC has been redesigned by 031bc5743f15 ("mm/debug-pagealloc: make debug-pagealloc boottime configurable") to allow being always enabled in a distro kernel, but only perform its expensive functionality when booted with debug_pagelloc=on. We can further reduce the overhead when not boot-enabled (including page allocator fast paths) using static keys. This patch introduces one for debug_pagealloc core functionality, and another for the optional guard page functionality (enabled by booting with debug_guardpage_minorder=X). Link: http://lkml.kernel.org/r/20190603143451.27353-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Joonsoo Kim Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0a6dae2f2b84..2c2e98cae2d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2701,11 +2701,18 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif -extern bool _debug_pagealloc_enabled; +#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT +DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); +#else +DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +#endif static inline bool debug_pagealloc_enabled(void) { - return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled; + if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) + return false; + + return static_branch_unlikely(&_debug_pagealloc_enabled); } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) @@ -2859,7 +2866,7 @@ extern struct page_ext_operations debug_guardpage_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; -extern bool _debug_guardpage_enabled; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { @@ -2868,7 +2875,7 @@ static inline unsigned int debug_guardpage_minorder(void) static inline bool debug_guardpage_enabled(void) { - return _debug_guardpage_enabled; + return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) -- cgit v1.2.3 From 3972f6bb1c6ae1d32dcf2e4ff635d24b77f26dcb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:13 -0700 Subject: mm, debug_pagealloc: use a page type instead of page_ext flag When debug_pagealloc is enabled, we currently allocate the page_ext array to mark guard pages with the PAGE_EXT_DEBUG_GUARD flag. Now that we have the page_type field in struct page, we can use that instead, as guard pages are neither PageSlab nor mapped to userspace. This reduces memory overhead when debug_pagealloc is enabled and there are no other features requiring the page_ext array. Link: http://lkml.kernel.org/r/20190603143451.27353-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +--------- include/linux/page-flags.h | 6 ++++++ include/linux/page_ext.h | 1 - 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c2e98cae2d1..cb8d413d635e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2862,8 +2862,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, bool allow_pagefault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -extern struct page_ext_operations debug_guardpage_ops; - #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); @@ -2880,16 +2878,10 @@ static inline bool debug_guardpage_enabled(void) static inline bool page_is_guard(struct page *page) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return false; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + return PageGuard(page); } #else static inline unsigned int debug_guardpage_minorder(void) { return 0; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9f8712a4b1a5..b848517da64c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -703,6 +703,7 @@ PAGEFLAG_FALSE(DoubleMap) #define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 +#define PG_guard 0x00000800 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -754,6 +755,11 @@ PAGE_TYPE_OPS(Kmemcg, kmemcg) */ PAGE_TYPE_OPS(Table, table) +/* + * Marks guardpages used with debug_pagealloc. + */ +PAGE_TYPE_OPS(Guard, guard) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index f84f167ec04c..09592951725c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -17,7 +17,6 @@ struct page_ext_operations { #ifdef CONFIG_PAGE_EXTENSION enum page_ext_flags { - PAGE_EXT_DEBUG_GUARD, PAGE_EXT_OWNER, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, -- cgit v1.2.3 From 6c45b454191b330c8bc21d1ed3cf39bb6da1a4eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:20 -0700 Subject: mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page We can just pass a NULL filler and do the right thing inside of do_read_cache_page based on the NULL parameter. 
Link: http://lkml.kernel.org/r/20190520055731.24538-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 6fd0d3aa492c..c7552459a15f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -383,8 +383,7 @@ extern int read_cache_pages(struct address_space *mapping, static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return read_cache_page(mapping, index, filler, data); + return read_cache_page(mapping, index, NULL, data); } /* -- cgit v1.2.3 From eb085574a7526c4375965c5fbf7e5b0c19cdd336 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 11 Jul 2019 20:55:33 -0700 Subject: mm, swap: fix race between swapoff and some swap operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When swapin is performed, after getting the swap entry information from the page table, the system will swap in the swap entry without any lock held to prevent the swap device from being swapped off. This may cause a race like the following: CPU 1 runs do_swap_page -> swapin_readahead -> __read_swap_cache_async; CPU 2 then runs swapoff and sets p->swap_map = NULL; CPU 1 continues with swapcache_prepare -> __swap_duplicate, which dereferences p->swap_map /* !!! NULL pointer access */. Because swapoff is usually done only at system shutdown, the race may not hit many people in practice. But it is still a race that needs to be fixed. To fix the race, get_swap_device() is added to check whether the specified swap entry is valid in its swap device. If so, it will keep the swap entry valid by preventing the swap device from being swapped off until put_swap_device() is called. Because swapoff() is a very rare code path, to make the normal path run as fast as possible, rcu_read_lock/unlock() and synchronize_rcu() are used instead of a reference count to implement get/put_swap_device(). From get_swap_device() to put_swap_device(), the RCU reader side is locked, so synchronize_rcu() in swapoff() will wait until put_swap_device() is called. In addition to the swap_map, cluster_info, etc. data structures in struct swap_info_struct, the swap cache radix tree will be freed after swapoff, so this patch fixes the race between swap cache lookup and swapoff too. Races between some other swap cache usages and swapoff are also fixed by calling synchronize_rcu() between clearing PageSwapCache() and freeing the swap cache data structure. Another possible method to fix this is to use preempt_disable() + stop_machine() to prevent the swap device from being swapped off while its data structures are being accessed. The hot-path overhead of both methods is similar. The advantages of the RCU-based method are: 1. stop_machine() may disturb the normal execution code path on other CPUs. 2. The file cache uses RCU to protect its radix tree. If a similar mechanism is used for the swap cache too, it is easier to share code between them. 3. RCU is already used to protect the swap cache in total_swapcache_pages() and exit_swap_address_space(). The two mechanisms can be merged to simplify the logic.
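The resulting usage discipline can be sketched in userspace. In the rough analogue below, a reader-writer lock stands in for RCU and a boolean stands in for the SWP_VALID flag; the real kernel code differs, but the get/put bracketing is the same (build with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static bool valid = true;               /* cleared by "swapoff" */
static int swap_map[16];                /* freed by swapoff in the kernel */

static int *get_swap_device(void)
{
        pthread_rwlock_rdlock(&lock);   /* kernel: rcu_read_lock() */
        if (!valid) {
                pthread_rwlock_unlock(&lock);
                return NULL;            /* raced with swapoff */
        }
        return swap_map;
}

static void put_swap_device(void)
{
        pthread_rwlock_unlock(&lock);   /* kernel: rcu_read_unlock() */
}

static void swapoff(void)
{
        pthread_rwlock_wrlock(&lock);   /* kernel: synchronize_rcu() waits here */
        valid = false;
        pthread_rwlock_unlock(&lock);
}

int main(void)
{
        int *map = get_swap_device();

        if (map) {
                map[0] = 1;             /* safe: "swapoff" cannot complete yet */
                put_swap_device();
        }
        swapoff();
        map = get_swap_device();
        printf("after swapoff: %s\n", map ? "still valid" : "gone");
        if (map)
                put_swap_device();
        return 0;
}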
Link: http://lkml.kernel.org/r/20190522015423.14418-1-ying.huang@intel.com Fixes: 235b62176712 ("mm/swap: add cluster lock") Signed-off-by: "Huang, Ying" Reviewed-by: Andrea Parri Not-nacked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Paul E. McKenney Cc: Daniel Jordan Cc: Michal Hocko Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Mel Gorman Cc: Jérôme Glisse Cc: Yang Shi Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4bfb5c4ac108..6358a6185634 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,8 +175,9 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + SWP_VALID = (1 << 13), /* swap is valid to be operated on? */ /* add others here before... */ - SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); -extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); +extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern struct swap_info_struct *get_swap_device(swp_entry_t entry); + +static inline void put_swap_device(struct swap_info_struct *si) +{ + rcu_read_unlock(); +} #else /* CONFIG_SWAP */ @@ -576,7 +583,7 @@ static inline int page_swapcount(struct page *page) return 0; } -static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +static inline int __swap_count(swp_entry_t entry) { return 0; } -- cgit v1.2.3 From 4efaceb1c5f8136d5fec3f26549d294b8e898bd7 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 11 Jul 2019 20:55:41 -0700 Subject: mm, swap: use rbtree for swap_extent swap_extent is used to map swap page offset to backing device's block offset. For a continuous block range, one swap_extent is used and all these swap_extents are managed in a linked list. These swap_extents are used by map_swap_entry() during swap's read and write path. To find out the backing device's block offset for a page offset, the swap_extent list will be traversed linearly, with curr_swap_extent being used as a cache to speed up the search. This works well as long as swap_extents are not huge or when the number of processes that access swap device are few, but when the swap device has many extents and there are a number of processes accessing the swap device concurrently, it can be a problem. On one of our servers, the disk's remaining size is tight: $df -h Filesystem Size Used Avail Use% Mounted on ... ... /dev/nvme0n1p1 1.8T 1.3T 504G 72% /home/t4 When creating a 80G swapfile there, there are as many as 84656 swap extents. 
The end result is that the kernel spends about 30% of its time in map_swap_entry() and swap throughput is only 70MB/s. As a comparison, when I used a smaller swapfile, like a 4G one whose swap_extent count dropped to 2000, swap throughput is back to 400-500MB/s and map_swap_entry() is about 3%. One downside of using an rbtree for swap_extent is that 'struct rb_node' takes 24 bytes while 'struct list_head' takes 16 bytes, i.e. 8 bytes more for each swap_extent. For a swapfile that has 80k swap_extents, that means 625KiB more memory consumed. Test: Since it's not possible to reboot that server, I cannot test this patch directly there. Instead, I tested it on another server with an NVMe disk. I created a 20G swapfile on an NVMe-backed XFS fs. By default, the filesystem is quite clean and the created swapfile has only 2 extents. Testing vanilla and this patch shows no obvious performance difference when the swapfile is not fragmented. To see the patch's effects, I used some tweaks to manually fragment the swapfile by breaking the extent at each 1M boundary. This made the swapfile have 20K extents.

nr_task=4
kernel    swapout(KB/s)   map_swap_entry(perf)   swapin(KB/s)    map_swap_entry(perf)
vanilla   165191          90.77%                 171798          90.21%
patched   858993 +420%    2.16%                  715827 +317%    0.77%

nr_task=8
kernel    swapout(KB/s)   map_swap_entry(perf)   swapin(KB/s)    map_swap_entry(perf)
vanilla   306783          92.19%                 318145          87.76%
patched   954437 +211%    2.35%                  1073741 +237%   1.57%

swapout: the throughput of swap out, in KB/s, higher is better. 1st map_swap_entry: CPU cycle percentage sampled by perf. swapin: the throughput of swap in, in KB/s, higher is better. 2nd map_swap_entry: CPU cycle percentage sampled by perf. nr_task=1 doesn't show any difference; this is because curr_swap_extent can be used effectively to cache the correct swap extent for a single-task workload. [akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/] Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu Signed-off-by: Aaron Lu Cc: Huang Ying Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6358a6185634..de2c67a33b7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -148,7 +148,7 @@ struct zone; * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { - struct list_head list; + struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; @@ -248,8 +248,7 @@ struct swap_info_struct { unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct swap_extent *curr_swap_extent; - struct swap_extent first_swap_extent; + struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ -- cgit v1.2.3 From 1e577f970f66a53d429cbee37b36177c9712f488 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:55:55 -0700 Subject: mm, memcg: introduce memory.events.local The memory controller in cgroup v2 exposes a memory.events file for each memcg which shows the number of times events like low, high, max, oom and oom_kill have happened for the whole tree rooted at that memcg.
Users can also poll it or register a notification to monitor changes in that file. Any event at any level of the tree rooted at a memcg will notify all the listeners along the path up to root_mem_cgroup. There are existing users which depend on this behavior. However, there are users which are only interested in the events happening at a specific level of the memcg tree and not in the events in the underlying tree rooted at that memcg. One such use-case is a centralized resource monitor which can dynamically adjust the limits of the jobs running on a system. The jobs can create their own sub-hierarchies for their own sub-tasks. The centralized monitor is only interested in the events at the top-level memcgs of the jobs, as it can then act and adjust the limits of the jobs. Using the current memory.events for such a centralized monitor is very inconvenient. The monitor will keep receiving events in which it is not interested, and to find out whether a received event is interesting, it has to read the memory.events files of the next level and compare them with the top-level one. So, let's introduce memory.events.local to the memcg, which shows and notifies for the events at that memcg's own level. Now, do memory.stat and memory.pressure need local versions? IMHO no, due to the no-internal-process constraint of cgroup v2. The memory.stat file of the top-level memcg of a job shows the stats and vmevents of the whole tree. The local stats or vmevents of the top-level memcg would only change if there were a process running in that memcg, but v2 does not allow that. Similarly for memory.pressure, there will not be any process in the internal nodes and thus no chance of local pressure. Link: http://lkml.kernel.org/r/20190527174643.209172-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1dcb763bb610..22141ebc5e15 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,8 +233,9 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* memory.events */ + /* memory.events and memory.events.local */ struct cgroup_file events_file; + struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; @@ -281,6 +282,7 @@ struct mem_cgroup { /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; + atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -747,6 +749,9 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + atomic_long_inc(&memcg->memory_events_local[event]); + cgroup_file_notify(&memcg->events_local_file); + do { atomic_long_inc(&memcg->memory_events[event]); cgroup_file_notify(&memcg->events_file); -- cgit v1.2.3 From 0b14e8aa68223c2c124d408aa4b110b364d13c53 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:06 -0700 Subject: mm: memcg/slab: rename slab delayed deactivation functions and fields The delayed work/rcu deactivation infrastructure of non-root kmem_caches can also be used for asynchronous release of these objects.
Let's get rid of the word "deactivation" in corresponding names to make the code look better after generalization. It's easier to make the renaming first, so that the generalized code will look consistent from scratch. Let's rename struct memcg_cache_params fields: deact_fn -> work_fn deact_rcu_head -> rcu_head deact_work -> work And RCU/delayed work callbacks in slab common code: kmemcg_deactivate_rcufn -> kmemcg_rcufn kmemcg_deactivate_workfn -> kmemcg_workfn This patch contains no functional changes, only renamings. Link: http://lkml.kernel.org/r/20190611231813.3148843-3-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 98c3d12b7275..6008d884e621 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -643,10 +643,10 @@ struct memcg_cache_params { struct list_head children_node; struct list_head kmem_caches_node; - void (*deact_fn)(struct kmem_cache *); + void (*work_fn)(struct kmem_cache *); union { - struct rcu_head deact_rcu_head; - struct work_struct deact_work; + struct rcu_head rcu_head; + struct work_struct work; }; }; }; -- cgit v1.2.3 From 49a18eae2e98a794477b5af5d85938e430c0be72 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:13 -0700 Subject: mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg() Let's separate the page counter modification code out of __memcg_kmem_uncharge() in a way similar to what __memcg_kmem_charge() and __memcg_kmem_charge_memcg() work. This will allow to reuse this code later using a new memcg_kmem_uncharge_memcg() wrapper, which calls __memcg_kmem_uncharge_memcg() if memcg_kmem_enabled() check is passed. Link: http://lkml.kernel.org/r/20190611231813.3148843-5-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 22141ebc5e15..68402842c337 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1278,6 +1278,8 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge(struct page *page, int order); int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg); +void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, + unsigned int nr_pages); extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1319,6 +1321,14 @@ static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, return __memcg_kmem_charge_memcg(page, gfp, order, memcg); return 0; } + +static inline void memcg_kmem_uncharge_memcg(struct page *page, int order, + struct mem_cgroup *memcg) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge_memcg(memcg, 1 << order); +} + /* * helper for accessing a memcg's index. 
It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function -- cgit v1.2.3 From f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:27 -0700 Subject: mm: memcg/slab: rework non-root kmem_cache lifecycle management Currently each charged slab page holds a reference to the cgroup to which it's charged. Kmem_caches are held by the memcg and are released all together with the memory cgroup. It means that none of kmem_caches are released unless at least one reference to the memcg exists, which is very far from optimal. Let's rework it in a way that allows releasing individual kmem_caches as soon as the cgroup is offline, the kmem_cache is empty and there are no pending allocations. To make it possible, let's introduce a new percpu refcounter for non-root kmem caches. The counter is initialized to the percpu mode, and is switched to the atomic mode during kmem_cache deactivation. The counter is bumped for every charged page and also for every running allocation. So the kmem_cache can't be released unless all allocations complete. To shutdown non-active empty kmem_caches, let's reuse the work queue, previously used for the kmem_cache deactivation. Once the reference counter reaches 0, let's schedule an asynchronous kmem_cache release. * I used the following simple approach to test the performance (stolen from another patchset by T. Harding): time find / -name fname-no-exist echo 2 > /proc/sys/vm/drop_caches repeat 10 times Results: orig patched real 0m1.455s real 0m1.355s user 0m0.206s user 0m0.219s sys 0m0.855s sys 0m0.807s real 0m1.487s real 0m1.699s user 0m0.221s user 0m0.256s sys 0m0.806s sys 0m0.948s real 0m1.515s real 0m1.505s user 0m0.183s user 0m0.215s sys 0m0.876s sys 0m0.858s real 0m1.291s real 0m1.380s user 0m0.193s user 0m0.198s sys 0m0.843s sys 0m0.786s real 0m1.364s real 0m1.374s user 0m0.180s user 0m0.182s sys 0m0.868s sys 0m0.806s real 0m1.352s real 0m1.312s user 0m0.201s user 0m0.212s sys 0m0.820s sys 0m0.761s real 0m1.302s real 0m1.349s user 0m0.205s user 0m0.203s sys 0m0.803s sys 0m0.792s real 0m1.334s real 0m1.301s user 0m0.194s user 0m0.201s sys 0m0.806s sys 0m0.779s real 0m1.426s real 0m1.434s user 0m0.216s user 0m0.181s sys 0m0.824s sys 0m0.864s real 0m1.350s real 0m1.295s user 0m0.200s user 0m0.190s sys 0m0.842s sys 0m0.811s So it looks like the difference is not noticeable in this test. 
[cai@lca.pw: fix an use-after-free in kmemcg_workfn()] Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com Signed-off-by: Roman Gushchin Signed-off-by: Qian Cai Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 6008d884e621..bc189a43e680 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include #include #include +#include /* @@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *); void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); void memcg_deactivate_kmem_caches(struct mem_cgroup *); -void memcg_destroy_kmem_caches(struct mem_cgroup *); /* * Please use this macro to create slab caches. Simply specify the @@ -642,6 +642,7 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head children_node; struct list_head kmem_caches_node; + struct percpu_ref refcnt; void (*work_fn)(struct kmem_cache *); union { -- cgit v1.2.3 From fb2f2b0adb98bbbbbb51c5a5327f3f90f5dc417e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:34 -0700 Subject: mm: memcg/slab: reparent memcg kmem_caches on cgroup removal Let's reparent non-root kmem_caches on memcg offlining. This allows us to release the memory cgroup without waiting for the last outstanding kernel object (e.g. dentry used by another application). Since the parent cgroup is already charged, everything we need to do is to splice the list of kmem_caches to the parent's kmem_caches list, swap the memcg pointer, drop the css refcounter for each kmem_cache and adjust the parent's css refcounter. Please, note that kmem_cache->memcg_params.memcg isn't a stable pointer anymore. It's safe to read it under rcu_read_lock(), cgroup_mutex held, or any other way that protects the memory cgroup from being released. We can race with the slab allocation and deallocation paths. It's not a big problem: parent's charge and slab global stats are always correct, and we don't care anymore about the child usage and global stats. The child cgroup is already offline, so we don't use or show it anywhere. Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) aren't used anywhere except count_shadow_nodes(). But even there it won't break anything: after reparenting "nodes" will be 0 on child level (because we're already reparenting shrinker lists), and on parent level page stats always were 0, and this patch won't change anything. 
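Mechanically, reparenting reduces to a list splice plus a pointer swap, as in this toy model (a simple singly-linked list replaces the kernel's list_head; all names are illustrative):

#include <stdio.h>

struct memcg_like;

struct cache {
        struct cache *next;
        struct memcg_like *memcg;
        const char *name;
};

struct memcg_like {
        struct cache *kmem_caches;
        const char *name;
};

static void reparent_kmem_caches(struct memcg_like *child, struct memcg_like *parent)
{
        struct cache *c, *last = NULL;

        for (c = child->kmem_caches; c; c = c->next) {
                c->memcg = parent;              /* swap the memcg pointer */
                last = c;
        }
        if (last) {                             /* splice child list onto parent */
                last->next = parent->kmem_caches;
                parent->kmem_caches = child->kmem_caches;
                child->kmem_caches = NULL;
        }
}

int main(void)
{
        struct memcg_like parent = { NULL, "parent" }, child = { NULL, "child" };
        struct cache c = { NULL, &child, "demo_cache" };

        child.kmem_caches = &c;
        reparent_kmem_caches(&child, &parent);
        printf("%s now charged to %s\n", c.name, c.memcg->name);
        return 0;
}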
[guro@fb.com: properly handle kmem_caches reparented to root_mem_cgroup] Link: http://lkml.kernel.org/r/20190620213427.1691847-1-guro@fb.com Link: http://lkml.kernel.org/r/20190611231813.3148843-11-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Acked-by: David Rientjes Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index bc189a43e680..fd0ef2e16178 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -152,7 +152,7 @@ void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); -void memcg_deactivate_kmem_caches(struct mem_cgroup *); +void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); /* * Please use this macro to create slab caches. Simply specify the -- cgit v1.2.3 From fcf8a1e483490cd249df4e02d5425636c3f43c86 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 11 Jul 2019 20:56:38 -0700 Subject: mm, memcg: add a memcg_slabinfo debugfs file There are concerns about memory leaks from extensive use of memory cgroups, as each memory cgroup creates its own set of kmem caches. There is a possibility that the memcg kmem caches may remain even after the memory cgroups have been offlined. Therefore, it will be useful to show the status of each of the memcg kmem caches. This patch introduces a new <debugfs>/memcg_slabinfo file which is somewhat similar to /proc/slabinfo in format, but lists only information about kmem caches that have child memcg kmem caches. Information available in /proc/slabinfo is not repeated in memcg_slabinfo. A portion of a sample output of the file was:

# rpc_inode_cache     root          13     51      1      1
  rpc_inode_cache     48             0      0      0      0
  fat_inode_cache     root           1     45      1      1
  fat_inode_cache     41             2     45      1      1
  xfs_inode           root         770    816     24     24
  xfs_inode           92            22     34      1      1
  xfs_inode           88:dead        1     34      1      1
  xfs_inode           89:dead       23     34      1      1
  xfs_inode           85             4     34      1      1
  xfs_inode           84             9     34      1      1

The css id of the memcg is also listed. If a memcg is not online, the tag ":dead" will be attached as shown above.
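Because the file lives in debugfs rather than procfs, debugfs must be mounted before it can be read. A trivial reader, assuming the conventional /sys/kernel/debug mount point:

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/kernel/debug/memcg_slabinfo", "r");

        if (!f) {
                perror("memcg_slabinfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}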
[longman@redhat.com: memcg: add ":deact" tag for reparented kmem caches in memcg_slabinfo] Link: http://lkml.kernel.org/r/20190621173005.31514-1-longman@redhat.com [longman@redhat.com: set the flag in the common code as suggested by Roman] Link: http://lkml.kernel.org/r/20190627184324.5875-1-longman@redhat.com Link: http://lkml.kernel.org/r/20190619171621.26209-1-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index fd0ef2e16178..56c9c7eed34e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,6 +116,10 @@ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ + +/* Slab deactivation flag */ +#define SLAB_DEACTIVATED ((slab_flags_t __force)0x10000000U) + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * -- cgit v1.2.3 From cbd34da7dc9afd521e0bea5e7d12701f4a9da7c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:28 -0700 Subject: mm: move the powerpc hugepd code to mm/gup.c While only powerpc supports the hugepd case, the code is pretty generic and I'd like to keep all GUP internals in one place. Link: http://lkml.kernel.org/r/20190625143715.1689-15-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f895a79c6f5c..edfca4278319 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,29 +16,11 @@ struct user_struct; struct mmu_gather; #ifndef is_hugepd -/* - * Some architectures requires a hugepage directory format that is - * required to support multiple hugepage sizes. For example - * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" - * introduced the same on powerpc. This allows for a more flexible hugepage - * pagetable layout. - */ typedef struct { unsigned long pd; } hugepd_t; #define is_hugepd(hugepd) (0) #define __hugepd(x) ((hugepd_t) { (x) }) -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr) -{ - return 0; -} -#else -extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr); #endif - #ifdef CONFIG_HUGETLB_PAGE #include -- cgit v1.2.3 From 8b1e0f81fb6fcf3109465a168b2e2da3f711fa86 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 11 Jul 2019 20:58:43 -0700 Subject: mm/pgtable: drop pgtable_t variable from pte_fn_t functions Drop the pgtable_t variable from all implementation for pte_fn_t as none of them use it. apply_to_pte_range() should stop computing it as well. 
Should help us save some cycles. Link: http://lkml.kernel.org/r/1556803126-26596-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Matthew Wilcox Cc: Ard Biesheuvel Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Michal Hocko Cc: Logan Gunthorpe Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8d413d635e..bb242ad810eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2686,8 +2686,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } -typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, - void *data); +typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -- cgit v1.2.3 From 6471384af2a6530696fc0203bafe4de41a23c9ef Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 11 Jul 2019 20:59:19 -0700 Subject: mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options Patch series "add init_on_alloc/init_on_free boot options", v10. Provide init_on_alloc and init_on_free boot options. These are aimed at preventing possible information leaks and making the control-flow bugs that depend on uninitialized values more deterministic. Enabling either of the options guarantees that the memory returned by the page allocator and SL[AU]B is initialized with zeroes. SLOB allocator isn't supported at the moment, as its emulation of kmem caches complicates handling of SLAB_TYPESAFE_BY_RCU caches correctly. Enabling init_on_free also guarantees that pages and heap objects are initialized right after they're freed, so it won't be possible to access stale data by using a dangling pointer. As suggested by Michal Hocko, right now we don't let the heap users to disable initialization for certain allocations. There's not enough evidence that doing so can speed up real-life cases, and introducing ways to opt-out may result in things going out of control. This patch (of 2): The new options are needed to prevent possible information leaks and make control-flow bugs that depend on uninitialized values more deterministic. This is expected to be on-by-default on Android and Chrome OS. And it gives the opportunity for anyone else to use it under distros too via the boot args. (The init_on_free feature is regularly requested by folks where memory forensics is included in their threat models.) init_on_alloc=1 makes the kernel initialize newly allocated pages and heap objects with zeroes. Initialization is done at allocation time at the places where checks for __GFP_ZERO are performed. init_on_free=1 makes the kernel initialize freed pages and heap objects with zeroes upon their deletion. This helps to ensure sensitive data doesn't leak via use-after-free accesses. Both init_on_alloc=1 and init_on_free=1 guarantee that the allocator returns zeroed memory. The two exceptions are slab caches with constructors and SLAB_TYPESAFE_BY_RCU flag. Those are never zero-initialized to preserve their semantics. Both init_on_alloc and init_on_free default to zero, but those defaults can be overridden with CONFIG_INIT_ON_ALLOC_DEFAULT_ON and CONFIG_INIT_ON_FREE_DEFAULT_ON. 
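The allocator-side gating (visible in the mm.h hunk further below) boils down to a small predicate; here is a userspace model with plain booleans standing in for the static keys and a made-up flag value standing in for the real gfp bit:

#include <stdbool.h>
#include <stdio.h>

#define __GFP_ZERO 0x1u         /* stand-in for the real gfp flag */

static bool init_on_alloc;      /* kernel: a static key, set at boot */
static bool page_poisoning;     /* poisoning takes precedence when enabled */

static bool want_init_on_alloc(unsigned int flags)
{
        if (init_on_alloc && !page_poisoning)
                return true;
        return flags & __GFP_ZERO;
}

int main(void)
{
        init_on_alloc = true;   /* as if booted with init_on_alloc=1 */
        printf("plain allocation zeroed: %d\n", want_init_on_alloc(0));
        printf("__GFP_ZERO allocation zeroed: %d\n", want_init_on_alloc(__GFP_ZERO));
        return 0;
}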
From 6471384af2a6530696fc0203bafe4de41a23c9ef Mon Sep 17 00:00:00 2001
From: Alexander Potapenko
Date: Thu, 11 Jul 2019 20:59:19 -0700
Subject: mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options

Patch series "add init_on_alloc/init_on_free boot options", v10.

Provide init_on_alloc and init_on_free boot options. These are aimed at
preventing possible information leaks and making the control-flow bugs
that depend on uninitialized values more deterministic.

Enabling either of the options guarantees that the memory returned by the
page allocator and SL[AU]B is initialized with zeroes. The SLOB allocator
isn't supported at the moment, as its emulation of kmem caches complicates
handling of SLAB_TYPESAFE_BY_RCU caches correctly.

Enabling init_on_free also guarantees that pages and heap objects are
initialized right after they're freed, so it won't be possible to access
stale data by using a dangling pointer.

As suggested by Michal Hocko, right now we don't let heap users disable
initialization for certain allocations. There's not enough evidence that
doing so can speed up real-life cases, and introducing ways to opt out may
result in things going out of control.

This patch (of 2):

The new options are needed to prevent possible information leaks and make
control-flow bugs that depend on uninitialized values more deterministic.

This is expected to be on-by-default on Android and Chrome OS, and it
gives anyone else the opportunity to use it on distro kernels via the boot
arguments. (The init_on_free feature is regularly requested by folks
whose threat models include memory forensics.)

init_on_alloc=1 makes the kernel initialize newly allocated pages and heap
objects with zeroes. Initialization is done at allocation time at the
places where checks for __GFP_ZERO are performed.

init_on_free=1 makes the kernel initialize freed pages and heap objects
with zeroes upon their deletion. This helps to ensure sensitive data
doesn't leak via use-after-free accesses.

Both init_on_alloc=1 and init_on_free=1 guarantee that the allocator
returns zeroed memory. The two exceptions are slab caches with
constructors and caches with the SLAB_TYPESAFE_BY_RCU flag; those are
never zero-initialized to preserve their semantics.

Both init_on_alloc and init_on_free default to zero, but those defaults
can be overridden with CONFIG_INIT_ON_ALLOC_DEFAULT_ON and
CONFIG_INIT_ON_FREE_DEFAULT_ON.

If either SLUB poisoning or page poisoning is enabled, those options take
precedence over init_on_alloc and init_on_free: initialization is only
applied to unpoisoned allocations.

Slowdown for the new features compared to init_on_free=0, init_on_alloc=0:

  hackbench, init_on_free=1:              +7.62% sys time (st.err 0.74%)
  hackbench, init_on_alloc=1:             +7.75% sys time (st.err 2.14%)
  Linux build with -j12, init_on_free=1:  +8.38% wall time (st.err 0.39%)
  Linux build with -j12, init_on_free=1:  +24.42% sys time (st.err 0.52%)
  Linux build with -j12, init_on_alloc=1: -0.13% wall time (st.err 0.42%)
  Linux build with -j12, init_on_alloc=1: +0.57% sys time (st.err 0.40%)

The slowdown for init_on_free=0, init_on_alloc=0 compared to the baseline
is within the standard error.

The new features are also going to pave the way for hardware memory
tagging (e.g. arm64's MTE), which will require both on_alloc and on_free
hooks to set the tags for heap objects. With MTE, tagging will have the
same cost as memory initialization.

Although init_on_free is rather costly, there are paranoid use-cases where
in-memory data lifetime is desired to be minimized. There are various
arguments for and against the realism of the associated threat models,
but given that we'll need the infrastructure for MTE anyway, and there are
people who want wipe-on-free behavior no matter what the performance cost,
it seems reasonable to include it in this series.

[glider@google.com: v8]
  Link: http://lkml.kernel.org/r/20190626121943.131390-2-glider@google.com
[glider@google.com: v9]
  Link: http://lkml.kernel.org/r/20190627130316.254309-2-glider@google.com
[glider@google.com: v10]
  Link: http://lkml.kernel.org/r/20190628093131.199499-2-glider@google.com
Link: http://lkml.kernel.org/r/20190617151050.92663-2-glider@google.com
Signed-off-by: Alexander Potapenko
Acked-by: Kees Cook
Acked-by: Michal Hocko
[page and dmapool parts Acked-by: James Morris]
Cc: Christoph Lameter
Cc: Masahiro Yamada
Cc: "Serge E. Hallyn"
Cc: Nick Desaulniers
Cc: Kostya Serebryany
Cc: Dmitry Vyukov
Cc: Sandeep Patil
Cc: Laura Abbott
Cc: Randy Dunlap
Cc: Jann Horn
Cc: Mark Rutland
Cc: Marco Elver
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bb242ad810eb..f88f0eabcc5e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2700,6 +2700,30 @@ static inline void kernel_poison_pages(struct page *page, int numpages,
 					int enable) { }
 #endif
 
+#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
+DECLARE_STATIC_KEY_TRUE(init_on_alloc);
+#else
+DECLARE_STATIC_KEY_FALSE(init_on_alloc);
+#endif
+static inline bool want_init_on_alloc(gfp_t flags)
+{
+	if (static_branch_unlikely(&init_on_alloc) &&
+	    !page_poisoning_enabled())
+		return true;
+	return flags & __GFP_ZERO;
+}
+
+#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
+DECLARE_STATIC_KEY_TRUE(init_on_free);
+#else
+DECLARE_STATIC_KEY_FALSE(init_on_free);
+#endif
+static inline bool want_init_on_free(void)
+{
+	return static_branch_unlikely(&init_on_free) &&
+	       !page_poisoning_enabled();
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
 DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
 #else
-- cgit v1.2.3
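To illustrate how the hooks are meant to be used (a sketch under
assumptions, not the actual allocator changes): an allocation path can
gate its wiping on want_init_on_alloc(), with the static branch keeping
the check essentially free when the feature is off. Here
kernel_init_free_pages() stands for the page-wiping helper this series
adds elsewhere:

/*
 * Illustrative sketch of an allocation path.  want_init_on_alloc()
 * folds the __GFP_ZERO check and the init_on_alloc static key into one
 * predicate; kernel_init_free_pages() is assumed to be the zeroing
 * helper introduced later in this series.
 */
static void post_alloc_hook_sketch(struct page *page, unsigned int order,
				   gfp_t gfp_flags)
{
	if (want_init_on_alloc(gfp_flags))
		kernel_init_free_pages(page, 1 << order);
}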
From 97105f0ab7b877a8ece2005e214894e93793950c Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Thu, 11 Jul 2019 21:00:13 -0700
Subject: mm: vmalloc: show number of vmalloc pages in /proc/meminfo

vmalloc() is getting used more and more these days (kernel stacks, bpf and
the percpu allocator are new top users), and the total share of memory it
consumes can be pretty significant and changes dynamically.

/proc/meminfo is the best place to display this information: its top goal
is to show the top consumers of memory.

Since the VmallocUsed field in /proc/meminfo has not been in use for quite
a long time (it has been defined to 0 by a5ad88ce8c7f ("mm: get rid of
'vmalloc_info' from /proc/meminfo")), let's reuse it to show the actual
physical memory consumption of vmalloc().

Link: http://lkml.kernel.org/r/20190417194002.12369-3-guro@fb.com
Signed-off-by: Roman Gushchin
Acked-by: Johannes Weiner
Acked-by: Vlastimil Babka
Reviewed-by: Andrew Morton
Cc: Matthew Wilcox
Cc: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/vmalloc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 51e131245379..9b21d0047710 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -72,10 +72,12 @@ extern void vm_unmap_aliases(void);
 
 #ifdef CONFIG_MMU
 extern void __init vmalloc_init(void);
+extern unsigned long vmalloc_nr_pages(void);
 #else
 static inline void vmalloc_init(void)
 {
 }
+static inline unsigned long vmalloc_nr_pages(void) { return 0; }
 #endif
 
 extern void *vmalloc(unsigned long size);
-- cgit v1.2.3
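For illustration (a sketch, not the actual fs/proc/meminfo.c change):
the reporting side only needs to convert the page count to kilobytes:

/*
 * Illustrative sketch of the /proc/meminfo side.  vmalloc_nr_pages()
 * counts the physical pages backing vmalloc areas, so this reports
 * real memory consumption rather than virtual address space usage.
 */
static void report_vmalloc_used_sketch(struct seq_file *m)
{
	/* pages -> kB: one page is (1 << (PAGE_SHIFT - 10)) kB */
	seq_printf(m, "VmallocUsed:    %8lu kB\n",
		   vmalloc_nr_pages() << (PAGE_SHIFT - 10));
}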
From 6ba749ee78ef42ffdf4b95c042fc574a37d229d9 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Thu, 11 Jul 2019 21:00:26 -0700
Subject: mm, oom: remove redundant task_in_mem_cgroup() check

oom_unkillable_task() can be called from three different contexts, i.e.
global OOM, memcg OOM and the oom_score procfs interface. At the moment
oom_unkillable_task() does a task_in_mem_cgroup() check on the given
process. Since there is no reason to perform the task_in_mem_cgroup()
check for global OOM and the oom_score procfs interface, those contexts
provide a NULL memcg and skip the task_in_mem_cgroup() check.

However for the memcg OOM context, oom_unkillable_task() is always called
from mem_cgroup_scan_tasks(), so the task_in_mem_cgroup() check is
redundant and effectively dead code. So, just remove the
task_in_mem_cgroup() check altogether.

Link: http://lkml.kernel.org/r/20190624212631.87212-2-shakeelb@google.com
Signed-off-by: Shakeel Butt
Signed-off-by: Tetsuo Handa
Acked-by: Roman Gushchin
Acked-by: Michal Hocko
Cc: David Rientjes
Cc: Johannes Weiner
Cc: KOSAKI Motohiro
Cc: Nick Piggin
Cc: Paul Jackson
Cc: Vladimir Davydov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 7 -------
 include/linux/oom.h        | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 68402842c337..44c41462be33 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -394,7 +394,6 @@ out:
 
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
@@ -875,12 +874,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return true;
 }
 
-static inline bool task_in_mem_cgroup(struct task_struct *task,
-				      const struct mem_cgroup *memcg)
-{
-	return true;
-}
-
 static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	return NULL;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index d07992009265..b75104690311 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -108,7 +108,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask,
+		const nodemask_t *nodemask,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
-- cgit v1.2.3
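A sketch of why the check was dead code in the memcg path (illustrative,
not from the patch): the memcg OOM killer only ever iterates tasks that
belong to the target cgroup, so every task oom_unkillable_task() sees
there has already passed the membership test:

/*
 * Illustrative sketch: in a memcg OOM, victim selection walks only the
 * tasks of the OOMing cgroup, so a task_in_mem_cgroup() re-check inside
 * the callback can never fail.
 */
static void select_bad_process_sketch(struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	/* ... the global case iterates the whole tasklist instead ... */
}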
From ac311a14c682dcd8a120a6244d0542ec654e3d93 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Thu, 11 Jul 2019 21:00:31 -0700
Subject: oom: decouple mems_allowed from oom_unkillable_task

Commit ef08e3b4981a ("[PATCH] cpusets: confine oom_killer to
mem_exclusive cpuset") introduced a heuristic where a potential oom-killer
victim is skipped if the intersection of the mems_allowed of the potential
victim and of current (the process that triggered the oom) is empty, on
the grounds that killing such a victim will most probably not help the
current allocating process.

However commit 7887a3da753e ("[PATCH] oom: cpuset hint") changed the
heuristic to just decrease the oom_badness scores of such potential
victims, on the grounds that the cpuset of such processes might have
changed, and they may previously have allocated memory on mems from which
the current allocating process can allocate.

Unintentionally, 7887a3da753e ("[PATCH] oom: cpuset hint") introduced a
side effect: since oom_badness is also exposed to user space through
/proc/[pid]/oom_score, readers with different cpusets can read different
oom_score values for the same process.

Later, commit 6cf86ac6f36b ("oom: filter tasks not sharing the same
cpuset") fixed the side effect introduced by 7887a3da753e by moving the
cpuset intersection back into the oom-killer context only, and out of
oom_badness.

However the combination of ab290adbaf8f ("oom: make
oom_unkillable_task() helper function") and 26ebc984913b ("oom:
/proc/<pid>/oom_score treat kernel thread honestly") unintentionally
brought the cpuset intersection check back into the oom_badness
calculation function.

Beyond doing the cpuset/mempolicy intersection in oom_badness, the memcg
oom context also does the cpuset/mempolicy intersection, which is quite
wrong and was caught by syzkaller with the following report:

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 28426 Comm: syz-executor.5 Not tainted 5.2.0-rc3-next-20190607
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline]
RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline]
RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline]
RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155
Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff
RSP: 0018:ffff888000127490 EFLAGS: 00010a03
RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c
RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001
RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0
R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007
R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6
FS:  00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000607304 CR3: 000000009237e000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600
Call Trace:
 oom_evaluate_task+0x49/0x520 mm/oom_kill.c:321
 mem_cgroup_scan_tasks+0xcc/0x180 mm/memcontrol.c:1169
 select_bad_process mm/oom_kill.c:374 [inline]
 out_of_memory mm/oom_kill.c:1088 [inline]
 out_of_memory+0x6b2/0x1280 mm/oom_kill.c:1035
 mem_cgroup_out_of_memory+0x1ca/0x230 mm/memcontrol.c:1573
 mem_cgroup_oom mm/memcontrol.c:1905 [inline]
 try_charge+0xfbe/0x1480 mm/memcontrol.c:2468
 mem_cgroup_try_charge+0x24d/0x5e0 mm/memcontrol.c:6073
 mem_cgroup_try_charge_delay+0x1f/0xa0 mm/memcontrol.c:6088
 do_huge_pmd_wp_page_fallback+0x24f/0x1680 mm/huge_memory.c:1201
 do_huge_pmd_wp_page+0x7fc/0x2160 mm/huge_memory.c:1359
 wp_huge_pmd mm/memory.c:3793 [inline]
 __handle_mm_fault+0x164c/0x3eb0 mm/memory.c:4006
 handle_mm_fault+0x3b7/0xa90 mm/memory.c:4053
 do_user_addr_fault arch/x86/mm/fault.c:1455 [inline]
 __do_page_fault+0x5ef/0xda0 arch/x86/mm/fault.c:1521
 do_page_fault+0x71/0x57d arch/x86/mm/fault.c:1552
 page_fault+0x1e/0x30 arch/x86/entry/entry_64.S:1156
RIP: 0033:0x400590
Code: 06 e9 49 01 00 00 48 8b 44 24 10 48 0b 44 24 28 75 1f 48 8b 14 24 48 8b 7c 24 20 be 04 00 00 00 e8 f5 56 00 00 48 8b 74 24 08 <89> 06 e9 1e 01 00 00 48 8b 44 24 08 48 8b 14 24 be 04 00 00 00 8b
RSP: 002b:00007fff7bc49780 EFLAGS: 00010206
RAX: 0000000000000001 RBX: 0000000000760000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 000000002000cffc RDI: 0000000000000001
RBP: fffffffffffffffe R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000075 R11: 0000000000000246 R12: 0000000000760008
R13: 00000000004c55f2 R14: 0000000000000000 R15: 00007fff7bc499b0
Modules linked in:
---[ end trace a65689219582ffff ]---
RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline]
RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline]
RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline]
RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155
Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff
RSP: 0018:ffff888000127490 EFLAGS: 00010a03
RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c
RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001
RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0
R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007
R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6
FS:  00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b2f823000 CR3: 000000009237e000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600

The fix is to decouple the cpuset/mempolicy intersection check from
oom_unkillable_task() and make sure the cpuset/mempolicy intersection
check is only done in the global oom context.

[shakeelb@google.com: change function name and update comment]
  Link: http://lkml.kernel.org/r/20190628152421.198994-3-shakeelb@google.com
Link: http://lkml.kernel.org/r/20190624212631.87212-3-shakeelb@google.com
Signed-off-by: Shakeel Butt
Reported-by: syzbot+d0fc9d3c166bc5e4a94b@syzkaller.appspotmail.com
Acked-by: Roman Gushchin
Acked-by: Michal Hocko
Cc: David Rientjes
Cc: Johannes Weiner
Cc: KOSAKI Motohiro
Cc: Nick Piggin
Cc: Paul Jackson
Cc: Tetsuo Handa
Cc: Vladimir Davydov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/oom.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index b75104690311..c696c265f019 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
-		const nodemask_t *nodemask,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
-- cgit v1.2.3
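For illustration (a sketch under assumptions, not the exact patch): the
intersection check survives in its own helper, consulted only by the
global OOM path; the memcg path reports every task as eligible, so
oom_badness() and /proc/<pid>/oom_score no longer depend on the reader's
cpuset:

/*
 * Sketch of the decoupled check.  In the final patch the helper is
 * named oom_cpuset_eligible(); this condensed version checks only the
 * given task rather than iterating every thread of the group.
 */
static bool oom_cpuset_eligible_sketch(struct task_struct *tsk,
				       struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		return true;	/* cpuset constraint is irrelevant for memcg OOM */

	return oc->nodemask ?
		mempolicy_nodemask_intersects(tsk, oc->nodemask) :
		cpuset_mems_allowed_intersects(current, tsk);
}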