From 9bd3bb6703d8c0a5fb8aec8e3287bd55b7341dcd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 11 Jul 2019 20:52:08 -0700 Subject: mm/nvdimm: add is_ioremap_addr and use that to check ioremap address Architectures like powerpc use different address ranges for the ioremap and vmalloc mappings. The memunmap() check used by the nvdimm layer was wrongly using is_vmalloc_addr() to check for the ioremap range, which fails for ppc64. This results in ppc64 not freeing the ioremap mapping. The side effect of this is an unbind failure during module unload with the papr_scm nvdimm driver. Link: http://lkml.kernel.org/r/20190701134038.14165-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions") Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dd0b5f4e1e45..0a6dae2f2b84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -633,6 +633,11 @@ static inline bool is_vmalloc_addr(const void *x) return false; #endif } + +#ifndef is_ioremap_addr +#define is_ioremap_addr(x) is_vmalloc_addr(x) +#endif + #ifdef CONFIG_MMU extern int is_vmalloc_or_module_addr(const void *x); #else -- cgit v1.2.3 From a760f8a67cb38d19fd52f2a28c65c967e469367e Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 11 Jul 2019 20:52:24 -0700 Subject: include/linux/dmar.h: replace single-char identifiers in macros A few macros in the IOMMU code use single-char identifiers, which makes the code hard to read and debug. Replace them with meaningful names. Link: http://lkml.kernel.org/r/1559566783-13627-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Suggested-by: Andrew Morton Cc: Joerg Roedel Cc: Robin Murphy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dmar.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 28813c6f44b6..a7cf3599d9a1 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -92,12 +92,14 @@ static inline bool dmar_rcu_check(void) #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) -#define for_each_dev_scope(a, c, p, d) \ - for ((p) = 0; ((d) = (p) < (c) ? dmar_rcu_dereference((a)[(p)].dev) : \ - NULL, (p) < (c)); (p)++) - -#define for_each_active_dev_scope(a, c, p, d) \ - for_each_dev_scope((a), (c), (p), (d)) if (!(d)) { continue; } else +#define for_each_dev_scope(devs, cnt, i, tmp) \ + for ((i) = 0; ((tmp) = (i) < (cnt) ? \ + dmar_rcu_dereference((devs)[(i)].dev) : NULL, (i) < (cnt)); \ + (i)++) + +#define for_each_active_dev_scope(devs, cnt, i, tmp) \ + for_each_dev_scope((devs), (cnt), (i), (tmp)) \ + if (!(tmp)) { continue; } else extern int dmar_table_init(void); extern int dmar_dev_scope_init(void); -- cgit v1.2.3 From 7d8ad890dad00f6cd64bfb44d9be4fceb10cf819 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:03 -0700 Subject: mm/kasan: introduce __kasan_check_{read,write} Patch series "mm/kasan: Add object validation in ksize()", v3. This patch (of 5): This introduces __kasan_check_{read,write}. The __kasan_check functions may be used from anywhere, even compilation units that disable instrumentation selectively. This change eliminates the need for the __KASAN_INTERNAL definition.
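To see the intent of the split, here is a minimal userspace model (illustrative only; it reuses the kernel names but is not the kernel implementation): the double-underscored variant is always compiled in once the feature is enabled, while the plain variant tracks the per-file instrumentation flag.

#include <stdio.h>

/* Always available once the feature is built in, even in files that
 * opt out of instrumentation (userspace stand-in for __kasan_check_read). */
static void __kasan_check_read(const volatile void *p, unsigned int size)
{
        printf("explicit check: read of %u bytes at %p\n", size, (void *)p);
}

/* Only real in instrumented translation units; a no-op elsewhere. */
#ifdef __SANITIZE_ADDRESS__
#define kasan_check_read __kasan_check_read
#else
static void kasan_check_read(const volatile void *p, unsigned int size) { }
#endif

int main(void)
{
        int x = 42;

        __kasan_check_read(&x, sizeof(x));      /* always active */
        kasan_check_read(&x, sizeof(x));        /* active only if instrumented */
        return 0;
}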
[elver@google.com: v5] Link: http://lkml.kernel.org/r/20190708170706.174189-2-elver@google.com Link: http://lkml.kernel.org/r/20190626142014.141844-2-elver@google.com Signed-off-by: Marco Elver Acked-by: Mark Rutland Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index a61dc075e2ce..221f05fbddd7 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,9 +2,28 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H -#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) -void kasan_check_read(const volatile void *p, unsigned int size); -void kasan_check_write(const volatile void *p, unsigned int size); +/* + * __kasan_check_*: Always available when KASAN is enabled. This may be used + * even in compilation units that selectively disable KASAN, but must use KASAN + * to validate access to an address. Never use these in header files! + */ +#ifdef CONFIG_KASAN +void __kasan_check_read(const volatile void *p, unsigned int size); +void __kasan_check_write(const volatile void *p, unsigned int size); +#else +static inline void __kasan_check_read(const volatile void *p, unsigned int size) +{ } +static inline void __kasan_check_write(const volatile void *p, unsigned int size) +{ } +#endif + +/* + * kasan_check_*: Only available when the particular compilation unit has KASAN + * instrumentation enabled. May be used in header files. + */ +#ifdef __SANITIZE_ADDRESS__ +#define kasan_check_read __kasan_check_read +#define kasan_check_write __kasan_check_write #else static inline void kasan_check_read(const volatile void *p, unsigned int size) { } -- cgit v1.2.3 From b5f6e0fc7d60e0234dac82498e90dfe9027bad1f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:07 -0700 Subject: mm/kasan: change kasan_check_{read,write} to return boolean This changes {,__}kasan_check_{read,write} functions to return a boolean denoting if the access was valid or not. [sfr@canb.auug.org.au: include types.h for "bool"] Link: http://lkml.kernel.org/r/20190705184949.13cdd021@canb.auug.org.au Link: http://lkml.kernel.org/r/20190626142014.141844-3-elver@google.com Signed-off-by: Marco Elver Signed-off-by: Stephen Rothwell Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index 221f05fbddd7..ac6aba632f2d 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,19 +2,25 @@ #ifndef _LINUX_KASAN_CHECKS_H #define _LINUX_KASAN_CHECKS_H +#include + /* * __kasan_check_*: Always available when KASAN is enabled. This may be used * even in compilation units that selectively disable KASAN, but must use KASAN * to validate access to an address. Never use these in header files! 
*/ #ifdef CONFIG_KASAN -void __kasan_check_read(const volatile void *p, unsigned int size); -void __kasan_check_write(const volatile void *p, unsigned int size); +bool __kasan_check_read(const volatile void *p, unsigned int size); +bool __kasan_check_write(const volatile void *p, unsigned int size); #else -static inline void __kasan_check_read(const volatile void *p, unsigned int size) -{ } -static inline void __kasan_check_write(const volatile void *p, unsigned int size) -{ } +static inline bool __kasan_check_read(const volatile void *p, unsigned int size) +{ + return true; +} +static inline bool __kasan_check_write(const volatile void *p, unsigned int size) +{ + return true; +} #endif /* @@ -25,10 +31,14 @@ static inline void __kasan_check_write(const volatile void *p, unsigned int size #define kasan_check_read __kasan_check_read #define kasan_check_write __kasan_check_write #else -static inline void kasan_check_read(const volatile void *p, unsigned int size) -{ } -static inline void kasan_check_write(const volatile void *p, unsigned int size) -{ } +static inline bool kasan_check_read(const volatile void *p, unsigned int size) +{ + return true; +} +static inline bool kasan_check_write(const volatile void *p, unsigned int size) +{ + return true; +} #endif #endif -- cgit v1.2.3 From 10d1f8cb3965a6f633bf23eb984cda552927e3a5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:14 -0700 Subject: mm/slab: refactor common ksize KASAN logic into slab_common.c This refactors common code of ksize() between the various allocators into slab_common.c: __ksize() is the allocator-specific implementation without instrumentation, whereas ksize() includes the required KASAN logic. Link: http://lkml.kernel.org/r/20190626142014.141844-5-elver@google.com Signed-off-by: Marco Elver Acked-by: Christoph Lameter Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9449b19c5f10..98c3d12b7275 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,6 +184,7 @@ void * __must_check __krealloc(const void *, size_t, gfp_t); void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); void kzfree(const void *); +size_t __ksize(const void *); size_t ksize(const void *); #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR -- cgit v1.2.3 From 0d4ca4c9bab397b525c9a4f875d31410ce4bc738 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Jul 2019 20:54:18 -0700 Subject: mm/kasan: add object validation in ksize() ksize() has been unconditionally unpoisoning the whole shadow memory region associated with an allocation. This can lead to various undetected bugs, for example, double-kzfree(). Specifically, kzfree() uses ksize() to determine the actual allocation size, and subsequently zeroes the memory. Since ksize() used to just unpoison the whole shadow memory region, no invalid free was detected. This patch addresses this as follows: 1. Add a check in ksize(), and only then unpoison the memory region. 2. Preserve kasan_unpoison_slab() semantics by explicitly unpoisoning the shadow memory region using the size obtained from __ksize(). Tested: 1. 
With SLAB allocator: a) normal boot without warnings; b) verified the added double-kzfree() is detected. 2. With SLUB allocator: a) normal boot without warnings; b) verified the added double-kzfree() is detected. [elver@google.com: s/BUG_ON/WARN_ON_ONCE/, per Kees] Link: http://lkml.kernel.org/r/20190627094445.216365-6-elver@google.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199359 Link: http://lkml.kernel.org/r/20190626142014.141844-6-elver@google.com Signed-off-by: Marco Elver Acked-by: Kees Cook Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b40ea104dd36..cc8a03cc9674 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -76,8 +76,11 @@ void kasan_free_shadow(const struct vm_struct *vm); int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); -size_t ksize(const void *); -static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } +size_t __ksize(const void *); +static inline void kasan_unpoison_slab(const void *ptr) +{ + kasan_unpoison_shadow(ptr, __ksize(ptr)); +} size_t kasan_metadata_size(struct kmem_cache *cache); bool kasan_save_enable_multi_shot(void); -- cgit v1.2.3 From 2236b99d6a33df72befa7205c2d8381aca7ae701 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Jul 2019 20:54:21 -0700 Subject: include/linux/pfn_t.h: remove pfn_t_to_virt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has no callers and there is no virt_to_pfn_t(). Reported-by: Anshuman Khandual Cc: Dan Williams Cc: Jérôme Glisse Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 3c202a11a79e..01e8037023f7 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -66,13 +66,6 @@ static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) return PFN_PHYS(pfn_t_to_pfn(pfn)); } -static inline void *pfn_t_to_virt(pfn_t pfn) -{ - if (pfn_t_has_page(pfn) && !is_device_private_page(pfn_t_to_page(pfn))) - return __va(pfn_t_to_phys(pfn)); - return NULL; -} - static inline pfn_t page_to_pfn_t(struct page *page) { return pfn_to_pfn_t(page_to_pfn(page)); -- cgit v1.2.3 From 442a5a9a9295bfd9b0cffd0691ef8a6ce81db7c4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jul 2019 20:54:40 -0700 Subject: mm: make !CONFIG_HUGE_PAGE wrappers into static inlines Instead of using defines, which loses type safety and provokes unused variable warnings from gcc, put the constants into static inlines. 
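The problem with the old stubs is easy to reproduce in userspace. In the sketch below (hypothetical names, not the kernel code), compiling with -Wall warns about the "unused" variable only in the macro version, and only the inline version type-checks its argument:

#include <stddef.h>
#include <stdio.h>

struct hstate;

/* Macro stub: the argument vanishes at preprocessing time. */
#define huge_page_size_macro(h) ((unsigned long)4096)

/* Inline stub: the argument is type-checked and counts as a use. */
static inline unsigned long huge_page_size_inline(struct hstate *h)
{
        return 4096;
}

static unsigned long with_macro(void)
{
        struct hstate *h = NULL;        /* -Wunused-variable: h is expanded away */
        return huge_page_size_macro(h);
}

static unsigned long with_inline(void)
{
        struct hstate *h = NULL;        /* no warning: h is genuinely used */
        return huge_page_size_inline(h);
}

int main(void)
{
        printf("%lu %lu\n", with_macro(), with_inline());
        return 0;
}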
Link: http://lkml.kernel.org/r/20190522235102.GA15370@mellanox.com Signed-off-by: Jason Gunthorpe Suggested-by: Andrew Morton Reviewed-by: Mike Kravetz Cc: Jerome Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 102 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 86 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edf476c8cfb9..f895a79c6f5c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -608,22 +608,92 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; -#define alloc_huge_page(v, a, r) NULL -#define alloc_huge_page_node(h, nid) NULL -#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_vma(h, vma, address) NULL -#define alloc_bootmem_huge_page(h) NULL -#define hstate_file(f) NULL -#define hstate_sizelog(s) NULL -#define hstate_vma(v) NULL -#define hstate_inode(i) NULL -#define page_hstate(page) NULL -#define huge_page_size(h) PAGE_SIZE -#define huge_page_mask(h) PAGE_MASK -#define vma_kernel_pagesize(v) PAGE_SIZE -#define vma_mmu_pagesize(v) PAGE_SIZE -#define huge_page_order(h) 0 -#define huge_page_shift(h) PAGE_SHIFT + +static inline struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, + int avoid_reserve) +{ + return NULL; +} + +static inline struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + return NULL; +} + +static inline struct page * +alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask) +{ + return NULL; +} + +static inline struct page *alloc_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address) +{ + return NULL; +} + +static inline int __alloc_bootmem_huge_page(struct hstate *h) +{ + return 0; +} + +static inline struct hstate *hstate_file(struct file *f) +{ + return NULL; +} + +static inline struct hstate *hstate_sizelog(int page_size_log) +{ + return NULL; +} + +static inline struct hstate *hstate_vma(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline struct hstate *hstate_inode(struct inode *i) +{ + return NULL; +} + +static inline struct hstate *page_hstate(struct page *page) +{ + return NULL; +} + +static inline unsigned long huge_page_size(struct hstate *h) +{ + return PAGE_SIZE; +} + +static inline unsigned long huge_page_mask(struct hstate *h) +{ + return PAGE_MASK; +} + +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + return PAGE_SIZE; +} + +static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return PAGE_SIZE; +} + +static inline unsigned int huge_page_order(struct hstate *h) +{ + return 0; +} + +static inline unsigned int huge_page_shift(struct hstate *h) +{ + return PAGE_SHIFT; +} + static inline bool hstate_is_gigantic(struct hstate *h) { return false; -- cgit v1.2.3 From 219f8a2e25f0abbe222b170a0de2fd38c22d43ad Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 11 Jul 2019 20:54:43 -0700 Subject: include/linux/mm_types.h: ifdef struct vm_area_struct::swap_readahead_info The field is only used in swap code. 
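As a toy illustration of the mechanism (not the real struct, and with a plain long standing in for atomic_long_t), a conditionally compiled member costs nothing when the feature is compiled out:

#include <stdio.h>

struct vma_like {
        void *vm_private_data;
#ifdef CONFIG_SWAP
        long swap_readahead_info;       /* present only in swap-enabled builds */
#endif
};

int main(void)
{
        /* 16 bytes with -DCONFIG_SWAP on LP64, 8 bytes without */
        printf("sizeof(struct vma_like) = %zu\n", sizeof(struct vma_like));
        return 0;
}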
Link: http://lkml.kernel.org/r/20190503190500.GA30589@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8ec38b11b361..1d1093474c1a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -329,7 +329,9 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ +#ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; +#endif #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif -- cgit v1.2.3 From 1fcf0a561cd09d7fb7f7afa2ddfe05f72f32050e Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Thu, 11 Jul 2019 20:54:49 -0700 Subject: mm/page_isolation.c: change the prototype of undo_isolate_page_range() undo_isolate_page_range() never fails, so no need to return value. Link: http://lkml.kernel.org/r/1562075604-8979-1-git-send-email-kernelfans@gmail.com Signed-off-by: Pingfan Liu Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 280ae96dc4c3..1099c2fee20f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. * target range is [start_pfn, end_pfn) */ -int +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype); -- cgit v1.2.3 From 51b176290496518d6701bc40e63f70e4b6870198 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 11 Jul 2019 20:54:52 -0700 Subject: include/linux/vmpressure.h: use spinlock_t instead of struct spinlock For spinlocks the type spinlock_t should be used instead of "struct spinlock". Use spinlock_t for spinlock's definition. Link: http://lkml.kernel.org/r/20190704153803.12739-3-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 61e6fddfb26f..6d28bc433c1c 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -17,7 +17,7 @@ struct vmpressure { unsigned long tree_scanned; unsigned long tree_reclaimed; /* The lock is used to keep the scanned/reclaimed above in sync. */ - struct spinlock sr_lock; + spinlock_t sr_lock; /* The list of vmpressure_event structs. 
*/ struct list_head events; -- cgit v1.2.3 From f445884562dd8bc51eb4136bd21f014403d1813d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Jul 2019 20:54:59 -0700 Subject: include/linux/pagemap.h: document trylock_page() return value Cc: Henry Burns Cc: Jonathan Adams Cc: David Rientjes Cc: Mike Rapoport Cc: Vitaly Wool Cc: Xidong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fe0b29bf2df7..6fd0d3aa492c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -452,6 +452,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); +/* + * Return true if the page was successfully locked + */ static inline int trylock_page(struct page *page) { page = compound_head(page); -- cgit v1.2.3 From 96a2b03f281d3a3b29c27028164f43090d6495b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:06 -0700 Subject: mm, debug_pagelloc: use static keys to enable debugging Patch series "debug_pagealloc improvements". I have been recently debugging some pcplist corruptions, where it would be useful to perform struct page checks immediately as pages are allocated from and freed to pcplists, which is now only possible by rebuilding the kernel with CONFIG_DEBUG_VM (details in Patch 2 changelog). To make this kind of debugging simpler in future on a distro kernel, I have improved CONFIG_DEBUG_PAGEALLOC so that it has even smaller overhead when not enabled at boot time (Patch 1) and also when enabled (Patch 3), and extended it to perform the struct page checks more often when enabled (Patch 2). Now it can be configured in when building a distro kernel without extra overhead, and debugging page use after free or double free can be enabled simply by rebooting with debug_pagealloc=on. This patch (of 3): CONFIG_DEBUG_PAGEALLOC has been redesigned by 031bc5743f15 ("mm/debug-pagealloc: make debug-pagealloc boottime configurable") to allow being always enabled in a distro kernel, but only perform its expensive functionality when booted with debug_pagelloc=on. We can further reduce the overhead when not boot-enabled (including page allocator fast paths) using static keys. This patch introduces one for debug_pagealloc core functionality, and another for the optional guard page functionality (enabled by booting with debug_guardpage_minorder=X). Link: http://lkml.kernel.org/r/20190603143451.27353-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Joonsoo Kim Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0a6dae2f2b84..2c2e98cae2d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2701,11 +2701,18 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif -extern bool _debug_pagealloc_enabled; +#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT +DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); +#else +DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +#endif static inline bool debug_pagealloc_enabled(void) { - return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled; + if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) + return false; + + return static_branch_unlikely(&_debug_pagealloc_enabled); } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) @@ -2859,7 +2866,7 @@ extern struct page_ext_operations debug_guardpage_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; -extern bool _debug_guardpage_enabled; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { @@ -2868,7 +2875,7 @@ static inline unsigned int debug_guardpage_minorder(void) static inline bool debug_guardpage_enabled(void) { - return _debug_guardpage_enabled; + return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) -- cgit v1.2.3 From 3972f6bb1c6ae1d32dcf2e4ff635d24b77f26dcb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2019 20:55:13 -0700 Subject: mm, debug_pagealloc: use a page type instead of page_ext flag When debug_pagealloc is enabled, we currently allocate the page_ext array to mark guard pages with the PAGE_EXT_DEBUG_GUARD flag. Now that we have the page_type field in struct page, we can use that instead, as guard pages are neither PageSlab nor mapped to userspace. This reduces memory overhead when debug_pagealloc is enabled and there are no other features requiring the page_ext array. Link: http://lkml.kernel.org/r/20190603143451.27353-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +--------- include/linux/page-flags.h | 6 ++++++ include/linux/page_ext.h | 1 - 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c2e98cae2d1..cb8d413d635e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2862,8 +2862,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, bool allow_pagefault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -extern struct page_ext_operations debug_guardpage_ops; - #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); @@ -2880,16 +2878,10 @@ static inline bool debug_guardpage_enabled(void) static inline bool page_is_guard(struct page *page) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return false; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + return PageGuard(page); } #else static inline unsigned int debug_guardpage_minorder(void) { return 0; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9f8712a4b1a5..b848517da64c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -703,6 +703,7 @@ PAGEFLAG_FALSE(DoubleMap) #define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 +#define PG_guard 0x00000800 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -754,6 +755,11 @@ PAGE_TYPE_OPS(Kmemcg, kmemcg) */ PAGE_TYPE_OPS(Table, table) +/* + * Marks guardpages used with debug_pagealloc. + */ +PAGE_TYPE_OPS(Guard, guard) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index f84f167ec04c..09592951725c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -17,7 +17,6 @@ struct page_ext_operations { #ifdef CONFIG_PAGE_EXTENSION enum page_ext_flags { - PAGE_EXT_DEBUG_GUARD, PAGE_EXT_OWNER, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, -- cgit v1.2.3 From 6c45b454191b330c8bc21d1ed3cf39bb6da1a4eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:20 -0700 Subject: mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page We can just pass a NULL filler and do the right thing inside of do_read_cache_page based on the NULL parameter. 
Link: http://lkml.kernel.org/r/20190520055731.24538-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 6fd0d3aa492c..c7552459a15f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -383,8 +383,7 @@ extern int read_cache_pages(struct address_space *mapping, static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return read_cache_page(mapping, index, filler, data); + return read_cache_page(mapping, index, NULL, data); } /* -- cgit v1.2.3 From eb085574a7526c4375965c5fbf7e5b0c19cdd336 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 11 Jul 2019 20:55:33 -0700 Subject: mm, swap: fix race between swapoff and some swap operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When swapin is performed, after getting the swap entry information from the page table, the system will swap in the swap entry without any lock held to prevent the swap device from being swapped off. This may cause a race like the following: CPU 1 runs do_swap_page -> swapin_readahead -> __read_swap_cache_async; CPU 2 then runs swapoff and sets p->swap_map = NULL; CPU 1 continues with swapcache_prepare -> __swap_duplicate, which dereferences p->swap_map /* !!! NULL pointer access */. Because swapoff is usually done only at system shutdown, the race may not hit many people in practice. But it is still a race that needs to be fixed. To fix the race, get_swap_device() is added to check whether the specified swap entry is valid in its swap device. If so, it will keep the swap entry valid by preventing the swap device from being swapped off until put_swap_device() is called. Because swapoff() is a very rare code path, to make the normal path run as fast as possible, rcu_read_lock/unlock() and synchronize_rcu() are used instead of a reference count to implement get/put_swap_device(). From get_swap_device() to put_swap_device(), the RCU reader side is locked, so synchronize_rcu() in swapoff() will wait until put_swap_device() is called. In addition to the swap_map, cluster_info, etc. data structures in struct swap_info_struct, the swap cache radix tree will be freed after swapoff, so this patch fixes the race between swap cache lookup and swapoff too. Races between some other swap cache usages and swapoff are also fixed by calling synchronize_rcu() between clearing PageSwapCache() and freeing the swap cache data structure. Another possible method to fix this is to use preempt_disable() + stop_machine() to prevent the swap device from being swapped off while its data structures are being accessed. The hot-path overhead of both methods is similar. The advantages of the RCU-based method are: 1. stop_machine() may disturb the normal execution code path on other CPUs. 2. The file cache uses RCU to protect its radix tree. If a similar mechanism is used for the swap cache too, it is easier to share code between them. 3. RCU is already used to protect the swap cache in total_swapcache_pages() and exit_swap_address_space(). The two mechanisms can be merged to simplify the logic.
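The resulting usage discipline can be sketched in userspace. In the rough analogue below, a reader-writer lock stands in for RCU and a boolean stands in for the SWP_VALID flag; the real kernel code differs, but the get/put bracketing is the same (build with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static bool valid = true;               /* cleared by "swapoff" */
static int swap_map[16];                /* freed by swapoff in the kernel */

static int *get_swap_device(void)
{
        pthread_rwlock_rdlock(&lock);   /* kernel: rcu_read_lock() */
        if (!valid) {
                pthread_rwlock_unlock(&lock);
                return NULL;            /* raced with swapoff */
        }
        return swap_map;
}

static void put_swap_device(void)
{
        pthread_rwlock_unlock(&lock);   /* kernel: rcu_read_unlock() */
}

static void swapoff(void)
{
        pthread_rwlock_wrlock(&lock);   /* kernel: synchronize_rcu() waits here */
        valid = false;
        pthread_rwlock_unlock(&lock);
}

int main(void)
{
        int *map = get_swap_device();

        if (map) {
                map[0] = 1;             /* safe: "swapoff" cannot complete yet */
                put_swap_device();
        }
        swapoff();
        map = get_swap_device();
        printf("after swapoff: %s\n", map ? "still valid" : "gone");
        if (map)
                put_swap_device();
        return 0;
}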
Link: http://lkml.kernel.org/r/20190522015423.14418-1-ying.huang@intel.com Fixes: 235b62176712 ("mm/swap: add cluster lock") Signed-off-by: "Huang, Ying" Reviewed-by: Andrea Parri Not-nacked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Paul E. McKenney Cc: Daniel Jordan Cc: Michal Hocko Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Mel Gorman Cc: Jérôme Glisse Cc: Yang Shi Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4bfb5c4ac108..6358a6185634 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,8 +175,9 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + SWP_VALID = (1 << 13), /* swap is valid to be operated on? */ /* add others here before... */ - SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); -extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); +extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern struct swap_info_struct *get_swap_device(swp_entry_t entry); + +static inline void put_swap_device(struct swap_info_struct *si) +{ + rcu_read_unlock(); +} #else /* CONFIG_SWAP */ @@ -576,7 +583,7 @@ static inline int page_swapcount(struct page *page) return 0; } -static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +static inline int __swap_count(swp_entry_t entry) { return 0; } -- cgit v1.2.3 From 4efaceb1c5f8136d5fec3f26549d294b8e898bd7 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 11 Jul 2019 20:55:41 -0700 Subject: mm, swap: use rbtree for swap_extent swap_extent is used to map swap page offset to backing device's block offset. For a continuous block range, one swap_extent is used and all these swap_extents are managed in a linked list. These swap_extents are used by map_swap_entry() during swap's read and write path. To find out the backing device's block offset for a page offset, the swap_extent list will be traversed linearly, with curr_swap_extent being used as a cache to speed up the search. This works well as long as swap_extents are not huge or when the number of processes that access swap device are few, but when the swap device has many extents and there are a number of processes accessing the swap device concurrently, it can be a problem. On one of our servers, the disk's remaining size is tight: $df -h Filesystem Size Used Avail Use% Mounted on ... ... /dev/nvme0n1p1 1.8T 1.3T 504G 72% /home/t4 When creating a 80G swapfile there, there are as many as 84656 swap extents. 
The end result is that the kernel spends about 30% of its time in map_swap_entry() and swap throughput is only 70MB/s. As a comparison, when I used a smaller swapfile, like a 4G one whose swap_extent count dropped to 2000, swap throughput is back to 400-500MB/s and map_swap_entry() is about 3%. One downside of using an rbtree for swap_extent is that 'struct rb_node' takes 24 bytes while 'struct list_head' takes 16 bytes, i.e. 8 bytes more for each swap_extent. For a swapfile that has 80k swap_extents, that means 625KiB more memory consumed. Test: Since it's not possible to reboot that server, I cannot test this patch directly there. Instead, I tested it on another server with an NVMe disk. I created a 20G swapfile on an NVMe-backed XFS fs. By default, the filesystem is quite clean and the created swapfile has only 2 extents. Testing vanilla and this patch shows no obvious performance difference when the swapfile is not fragmented. To see the patch's effects, I used some tweaks to manually fragment the swapfile by breaking the extent at each 1M boundary. This made the swapfile have 20K extents.

nr_task=4
kernel    swapout(KB/s)   map_swap_entry(perf)   swapin(KB/s)    map_swap_entry(perf)
vanilla   165191          90.77%                 171798          90.21%
patched   858993 +420%    2.16%                  715827 +317%    0.77%

nr_task=8
kernel    swapout(KB/s)   map_swap_entry(perf)   swapin(KB/s)    map_swap_entry(perf)
vanilla   306783          92.19%                 318145          87.76%
patched   954437 +211%    2.35%                  1073741 +237%   1.57%

swapout: the throughput of swap out, in KB/s, higher is better. 1st map_swap_entry: CPU cycle percentage sampled by perf. swapin: the throughput of swap in, in KB/s, higher is better. 2nd map_swap_entry: CPU cycle percentage sampled by perf. nr_task=1 doesn't show any difference; this is because curr_swap_extent can be used effectively to cache the correct swap extent for a single-task workload. [akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/] Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu Signed-off-by: Aaron Lu Cc: Huang Ying Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6358a6185634..de2c67a33b7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -148,7 +148,7 @@ struct zone; * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { - struct list_head list; + struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; @@ -248,8 +248,7 @@ struct swap_info_struct { unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct swap_extent *curr_swap_extent; - struct swap_extent first_swap_extent; + struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ -- cgit v1.2.3 From 1e577f970f66a53d429cbee37b36177c9712f488 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:55:55 -0700 Subject: mm, memcg: introduce memory.events.local The memory controller in cgroup v2 exposes a memory.events file for each memcg which shows the number of times events like low, high, max, oom and oom_kill have happened for the whole tree rooted at that memcg.
Users can also poll it or register a notification to monitor changes in that file. Any event at any level of the tree rooted at a memcg will notify all the listeners along the path up to root_mem_cgroup. There are existing users which depend on this behavior. However, there are users which are only interested in the events happening at a specific level of the memcg tree and not in the events in the underlying tree rooted at that memcg. One such use-case is a centralized resource monitor which can dynamically adjust the limits of the jobs running on a system. The jobs can create their own sub-hierarchies for their own sub-tasks. The centralized monitor is only interested in the events at the top-level memcgs of the jobs, as it can then act and adjust the limits of the jobs. Using the current memory.events for such a centralized monitor is very inconvenient. The monitor will keep receiving events in which it is not interested, and to find out whether a received event is interesting, it has to read the memory.events files of the next level and compare them with the top-level one. So, let's introduce memory.events.local to the memcg, which shows and notifies for the events at that memcg's own level. Now, do memory.stat and memory.pressure need local versions? IMHO no, due to the no-internal-process constraint of cgroup v2. The memory.stat file of the top-level memcg of a job shows the stats and vmevents of the whole tree. The local stats or vmevents of the top-level memcg would only change if there were a process running in that memcg, but v2 does not allow that. Similarly for memory.pressure, there will not be any process in the internal nodes and thus no chance of local pressure. Link: http://lkml.kernel.org/r/20190527174643.209172-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1dcb763bb610..22141ebc5e15 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,8 +233,9 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* memory.events */ + /* memory.events and memory.events.local */ struct cgroup_file events_file; + struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; @@ -281,6 +282,7 @@ struct mem_cgroup { /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; + atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -747,6 +749,9 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + atomic_long_inc(&memcg->memory_events_local[event]); + cgroup_file_notify(&memcg->events_local_file); + do { atomic_long_inc(&memcg->memory_events[event]); cgroup_file_notify(&memcg->events_file); -- cgit v1.2.3 From 0b14e8aa68223c2c124d408aa4b110b364d13c53 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:06 -0700 Subject: mm: memcg/slab: rename slab delayed deactivation functions and fields The delayed work/rcu deactivation infrastructure of non-root kmem_caches can also be used for asynchronous release of these objects.
Let's get rid of the word "deactivation" in corresponding names to make the code look better after generalization. It's easier to make the renaming first, so that the generalized code will look consistent from scratch. Let's rename struct memcg_cache_params fields: deact_fn -> work_fn deact_rcu_head -> rcu_head deact_work -> work And RCU/delayed work callbacks in slab common code: kmemcg_deactivate_rcufn -> kmemcg_rcufn kmemcg_deactivate_workfn -> kmemcg_workfn This patch contains no functional changes, only renamings. Link: http://lkml.kernel.org/r/20190611231813.3148843-3-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 98c3d12b7275..6008d884e621 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -643,10 +643,10 @@ struct memcg_cache_params { struct list_head children_node; struct list_head kmem_caches_node; - void (*deact_fn)(struct kmem_cache *); + void (*work_fn)(struct kmem_cache *); union { - struct rcu_head deact_rcu_head; - struct work_struct deact_work; + struct rcu_head rcu_head; + struct work_struct work; }; }; }; -- cgit v1.2.3 From 49a18eae2e98a794477b5af5d85938e430c0be72 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:13 -0700 Subject: mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg() Let's separate the page counter modification code out of __memcg_kmem_uncharge() in a way similar to what __memcg_kmem_charge() and __memcg_kmem_charge_memcg() work. This will allow to reuse this code later using a new memcg_kmem_uncharge_memcg() wrapper, which calls __memcg_kmem_uncharge_memcg() if memcg_kmem_enabled() check is passed. Link: http://lkml.kernel.org/r/20190611231813.3148843-5-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 22141ebc5e15..68402842c337 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1278,6 +1278,8 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge(struct page *page, int order); int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg); +void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, + unsigned int nr_pages); extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1319,6 +1321,14 @@ static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, return __memcg_kmem_charge_memcg(page, gfp, order, memcg); return 0; } + +static inline void memcg_kmem_uncharge_memcg(struct page *page, int order, + struct mem_cgroup *memcg) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge_memcg(memcg, 1 << order); +} + /* * helper for accessing a memcg's index. 
It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function -- cgit v1.2.3 From f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:27 -0700 Subject: mm: memcg/slab: rework non-root kmem_cache lifecycle management Currently each charged slab page holds a reference to the cgroup to which it's charged. Kmem_caches are held by the memcg and are released all together with the memory cgroup. It means that none of kmem_caches are released unless at least one reference to the memcg exists, which is very far from optimal. Let's rework it in a way that allows releasing individual kmem_caches as soon as the cgroup is offline, the kmem_cache is empty and there are no pending allocations. To make it possible, let's introduce a new percpu refcounter for non-root kmem caches. The counter is initialized to the percpu mode, and is switched to the atomic mode during kmem_cache deactivation. The counter is bumped for every charged page and also for every running allocation. So the kmem_cache can't be released unless all allocations complete. To shutdown non-active empty kmem_caches, let's reuse the work queue, previously used for the kmem_cache deactivation. Once the reference counter reaches 0, let's schedule an asynchronous kmem_cache release. * I used the following simple approach to test the performance (stolen from another patchset by T. Harding): time find / -name fname-no-exist echo 2 > /proc/sys/vm/drop_caches repeat 10 times Results: orig patched real 0m1.455s real 0m1.355s user 0m0.206s user 0m0.219s sys 0m0.855s sys 0m0.807s real 0m1.487s real 0m1.699s user 0m0.221s user 0m0.256s sys 0m0.806s sys 0m0.948s real 0m1.515s real 0m1.505s user 0m0.183s user 0m0.215s sys 0m0.876s sys 0m0.858s real 0m1.291s real 0m1.380s user 0m0.193s user 0m0.198s sys 0m0.843s sys 0m0.786s real 0m1.364s real 0m1.374s user 0m0.180s user 0m0.182s sys 0m0.868s sys 0m0.806s real 0m1.352s real 0m1.312s user 0m0.201s user 0m0.212s sys 0m0.820s sys 0m0.761s real 0m1.302s real 0m1.349s user 0m0.205s user 0m0.203s sys 0m0.803s sys 0m0.792s real 0m1.334s real 0m1.301s user 0m0.194s user 0m0.201s sys 0m0.806s sys 0m0.779s real 0m1.426s real 0m1.434s user 0m0.216s user 0m0.181s sys 0m0.824s sys 0m0.864s real 0m1.350s real 0m1.295s user 0m0.200s user 0m0.190s sys 0m0.842s sys 0m0.811s So it looks like the difference is not noticeable in this test. 
[cai@lca.pw: fix an use-after-free in kmemcg_workfn()] Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com Signed-off-by: Roman Gushchin Signed-off-by: Qian Cai Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 6008d884e621..bc189a43e680 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include #include #include +#include /* @@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *); void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); void memcg_deactivate_kmem_caches(struct mem_cgroup *); -void memcg_destroy_kmem_caches(struct mem_cgroup *); /* * Please use this macro to create slab caches. Simply specify the @@ -642,6 +642,7 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head children_node; struct list_head kmem_caches_node; + struct percpu_ref refcnt; void (*work_fn)(struct kmem_cache *); union { -- cgit v1.2.3 From fb2f2b0adb98bbbbbb51c5a5327f3f90f5dc417e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 20:56:34 -0700 Subject: mm: memcg/slab: reparent memcg kmem_caches on cgroup removal Let's reparent non-root kmem_caches on memcg offlining. This allows us to release the memory cgroup without waiting for the last outstanding kernel object (e.g. dentry used by another application). Since the parent cgroup is already charged, everything we need to do is to splice the list of kmem_caches to the parent's kmem_caches list, swap the memcg pointer, drop the css refcounter for each kmem_cache and adjust the parent's css refcounter. Please, note that kmem_cache->memcg_params.memcg isn't a stable pointer anymore. It's safe to read it under rcu_read_lock(), cgroup_mutex held, or any other way that protects the memory cgroup from being released. We can race with the slab allocation and deallocation paths. It's not a big problem: parent's charge and slab global stats are always correct, and we don't care anymore about the child usage and global stats. The child cgroup is already offline, so we don't use or show it anywhere. Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) aren't used anywhere except count_shadow_nodes(). But even there it won't break anything: after reparenting "nodes" will be 0 on child level (because we're already reparenting shrinker lists), and on parent level page stats always were 0, and this patch won't change anything. 
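Mechanically, reparenting reduces to a list splice plus a pointer swap, as in this toy model (a simple singly-linked list replaces the kernel's list_head; all names are illustrative):

#include <stdio.h>

struct memcg_like;

struct cache {
        struct cache *next;
        struct memcg_like *memcg;
        const char *name;
};

struct memcg_like {
        struct cache *kmem_caches;
        const char *name;
};

static void reparent_kmem_caches(struct memcg_like *child, struct memcg_like *parent)
{
        struct cache *c, *last = NULL;

        for (c = child->kmem_caches; c; c = c->next) {
                c->memcg = parent;              /* swap the memcg pointer */
                last = c;
        }
        if (last) {                             /* splice child list onto parent */
                last->next = parent->kmem_caches;
                parent->kmem_caches = child->kmem_caches;
                child->kmem_caches = NULL;
        }
}

int main(void)
{
        struct memcg_like parent = { NULL, "parent" }, child = { NULL, "child" };
        struct cache c = { NULL, &child, "demo_cache" };

        child.kmem_caches = &c;
        reparent_kmem_caches(&child, &parent);
        printf("%s now charged to %s\n", c.name, c.memcg->name);
        return 0;
}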
[guro@fb.com: properly handle kmem_caches reparented to root_mem_cgroup] Link: http://lkml.kernel.org/r/20190620213427.1691847-1-guro@fb.com Link: http://lkml.kernel.org/r/20190611231813.3148843-11-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Vladimir Davydov Reviewed-by: Shakeel Butt Acked-by: David Rientjes Cc: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Waiman Long Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Andrei Vagin Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index bc189a43e680..fd0ef2e16178 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -152,7 +152,7 @@ void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); -void memcg_deactivate_kmem_caches(struct mem_cgroup *); +void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); /* * Please use this macro to create slab caches. Simply specify the -- cgit v1.2.3 From fcf8a1e483490cd249df4e02d5425636c3f43c86 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 11 Jul 2019 20:56:38 -0700 Subject: mm, memcg: add a memcg_slabinfo debugfs file There are concerns about memory leaks from extensive use of memory cgroups, as each memory cgroup creates its own set of kmem caches. There is a possibility that the memcg kmem caches may remain even after the memory cgroups have been offlined. Therefore, it will be useful to show the status of each of the memcg kmem caches. This patch introduces a new <debugfs>/memcg_slabinfo file which is somewhat similar to /proc/slabinfo in format, but lists only information about kmem caches that have child memcg kmem caches. Information available in /proc/slabinfo is not repeated in memcg_slabinfo. A portion of a sample output of the file was:

# rpc_inode_cache     root          13     51      1      1
  rpc_inode_cache     48             0      0      0      0
  fat_inode_cache     root           1     45      1      1
  fat_inode_cache     41             2     45      1      1
  xfs_inode           root         770    816     24     24
  xfs_inode           92            22     34      1      1
  xfs_inode           88:dead        1     34      1      1
  xfs_inode           89:dead       23     34      1      1
  xfs_inode           85             4     34      1      1
  xfs_inode           84             9     34      1      1

The css id of the memcg is also listed. If a memcg is not online, the tag ":dead" will be attached as shown above.
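Because the file lives in debugfs rather than procfs, debugfs must be mounted before it can be read. A trivial reader, assuming the conventional /sys/kernel/debug mount point:

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/kernel/debug/memcg_slabinfo", "r");

        if (!f) {
                perror("memcg_slabinfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}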
[longman@redhat.com: memcg: add ":deact" tag for reparented kmem caches in memcg_slabinfo] Link: http://lkml.kernel.org/r/20190621173005.31514-1-longman@redhat.com [longman@redhat.com: set the flag in the common code as suggested by Roman] Link: http://lkml.kernel.org/r/20190627184324.5875-1-longman@redhat.com Link: http://lkml.kernel.org/r/20190619171621.26209-1-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index fd0ef2e16178..56c9c7eed34e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,6 +116,10 @@ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ + +/* Slab deactivation flag */ +#define SLAB_DEACTIVATED ((slab_flags_t __force)0x10000000U) + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * -- cgit v1.2.3 From cbd34da7dc9afd521e0bea5e7d12701f4a9da7c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:57:28 -0700 Subject: mm: move the powerpc hugepd code to mm/gup.c While only powerpc supports the hugepd case, the code is pretty generic and I'd like to keep all GUP internals in one place. Link: http://lkml.kernel.org/r/20190625143715.1689-15-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Andrey Konovalov Cc: Benjamin Herrenschmidt Cc: David Miller Cc: James Hogan Cc: Jason Gunthorpe Cc: Khalid Aziz Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f895a79c6f5c..edfca4278319 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,29 +16,11 @@ struct user_struct; struct mmu_gather; #ifndef is_hugepd -/* - * Some architectures requires a hugepage directory format that is - * required to support multiple hugepage sizes. For example - * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" - * introduced the same on powerpc. This allows for a more flexible hugepage - * pagetable layout. - */ typedef struct { unsigned long pd; } hugepd_t; #define is_hugepd(hugepd) (0) #define __hugepd(x) ((hugepd_t) { (x) }) -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr) -{ - return 0; -} -#else -extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr); #endif - #ifdef CONFIG_HUGETLB_PAGE #include -- cgit v1.2.3 From 8b1e0f81fb6fcf3109465a168b2e2da3f711fa86 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 11 Jul 2019 20:58:43 -0700 Subject: mm/pgtable: drop pgtable_t variable from pte_fn_t functions Drop the pgtable_t variable from all implementation for pte_fn_t as none of them use it. apply_to_pte_range() should stop computing it as well. 
Should help us save some cycles. Link: http://lkml.kernel.org/r/1556803126-26596-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Matthew Wilcox Cc: Ard Biesheuvel Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Michal Hocko Cc: Logan Gunthorpe Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8d413d635e..bb242ad810eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2686,8 +2686,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } -typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, - void *data); +typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -- cgit v1.2.3 From 6471384af2a6530696fc0203bafe4de41a23c9ef Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 11 Jul 2019 20:59:19 -0700 Subject: mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options Patch series "add init_on_alloc/init_on_free boot options", v10. Provide init_on_alloc and init_on_free boot options. These are aimed at preventing possible information leaks and making the control-flow bugs that depend on uninitialized values more deterministic. Enabling either of the options guarantees that the memory returned by the page allocator and SL[AU]B is initialized with zeroes. SLOB allocator isn't supported at the moment, as its emulation of kmem caches complicates handling of SLAB_TYPESAFE_BY_RCU caches correctly. Enabling init_on_free also guarantees that pages and heap objects are initialized right after they're freed, so it won't be possible to access stale data by using a dangling pointer. As suggested by Michal Hocko, right now we don't let the heap users to disable initialization for certain allocations. There's not enough evidence that doing so can speed up real-life cases, and introducing ways to opt-out may result in things going out of control. This patch (of 2): The new options are needed to prevent possible information leaks and make control-flow bugs that depend on uninitialized values more deterministic. This is expected to be on-by-default on Android and Chrome OS. And it gives the opportunity for anyone else to use it under distros too via the boot args. (The init_on_free feature is regularly requested by folks where memory forensics is included in their threat models.) init_on_alloc=1 makes the kernel initialize newly allocated pages and heap objects with zeroes. Initialization is done at allocation time at the places where checks for __GFP_ZERO are performed. init_on_free=1 makes the kernel initialize freed pages and heap objects with zeroes upon their deletion. This helps to ensure sensitive data doesn't leak via use-after-free accesses. Both init_on_alloc=1 and init_on_free=1 guarantee that the allocator returns zeroed memory. The two exceptions are slab caches with constructors and SLAB_TYPESAFE_BY_RCU flag. Those are never zero-initialized to preserve their semantics. Both init_on_alloc and init_on_free default to zero, but those defaults can be overridden with CONFIG_INIT_ON_ALLOC_DEFAULT_ON and CONFIG_INIT_ON_FREE_DEFAULT_ON. 
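The allocator-side gating (visible in the mm.h hunk further below) boils down to a small predicate; here is a userspace model with plain booleans standing in for the static keys and a made-up flag value standing in for the real gfp bit:

#include <stdbool.h>
#include <stdio.h>

#define __GFP_ZERO 0x1u         /* stand-in for the real gfp flag */

static bool init_on_alloc;      /* kernel: a static key, set at boot */
static bool page_poisoning;     /* poisoning takes precedence when enabled */

static bool want_init_on_alloc(unsigned int flags)
{
        if (init_on_alloc && !page_poisoning)
                return true;
        return flags & __GFP_ZERO;
}

int main(void)
{
        init_on_alloc = true;   /* as if booted with init_on_alloc=1 */
        printf("plain allocation zeroed: %d\n", want_init_on_alloc(0));
        printf("__GFP_ZERO allocation zeroed: %d\n", want_init_on_alloc(__GFP_ZERO));
        return 0;
}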
From 6471384af2a6530696fc0203bafe4de41a23c9ef Mon Sep 17 00:00:00 2001
From: Alexander Potapenko
Date: Thu, 11 Jul 2019 20:59:19 -0700
Subject: mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options

Patch series "add init_on_alloc/init_on_free boot options", v10.

Provide init_on_alloc and init_on_free boot options. These are aimed at
preventing possible information leaks and making the control-flow bugs
that depend on uninitialized values more deterministic.

Enabling either of the options guarantees that the memory returned by the
page allocator and SL[AU]B is initialized with zeroes. The SLOB allocator
isn't supported at the moment, as its emulation of kmem caches complicates
handling of SLAB_TYPESAFE_BY_RCU caches correctly.

Enabling init_on_free also guarantees that pages and heap objects are
initialized right after they're freed, so it won't be possible to access
stale data by using a dangling pointer.

As suggested by Michal Hocko, right now we don't let heap users disable
initialization for certain allocations. There's not enough evidence that
doing so can speed up real-life cases, and introducing ways to opt out may
result in things going out of control.

This patch (of 2):

The new options are needed to prevent possible information leaks and make
control-flow bugs that depend on uninitialized values more deterministic.

This is expected to be on-by-default on Android and Chrome OS, and it
gives anyone else the opportunity to use it on distro kernels via the boot
arguments. (The init_on_free feature is regularly requested by folks
whose threat models include memory forensics.)

init_on_alloc=1 makes the kernel initialize newly allocated pages and heap
objects with zeroes. Initialization is done at allocation time at the
places where checks for __GFP_ZERO are performed.

init_on_free=1 makes the kernel initialize freed pages and heap objects
with zeroes upon their deletion. This helps to ensure sensitive data
doesn't leak via use-after-free accesses.

Both init_on_alloc=1 and init_on_free=1 guarantee that the allocator
returns zeroed memory. The two exceptions are slab caches with
constructors and caches with the SLAB_TYPESAFE_BY_RCU flag; those are
never zero-initialized to preserve their semantics.

Both init_on_alloc and init_on_free default to zero, but those defaults
can be overridden with CONFIG_INIT_ON_ALLOC_DEFAULT_ON and
CONFIG_INIT_ON_FREE_DEFAULT_ON.

If either SLUB poisoning or page poisoning is enabled, those options take
precedence over init_on_alloc and init_on_free: initialization is only
applied to unpoisoned allocations.

Slowdown for the new features compared to init_on_free=0, init_on_alloc=0:

  hackbench, init_on_free=1:              +7.62% sys time (st.err 0.74%)
  hackbench, init_on_alloc=1:             +7.75% sys time (st.err 2.14%)
  Linux build with -j12, init_on_free=1:  +8.38% wall time (st.err 0.39%)
  Linux build with -j12, init_on_free=1:  +24.42% sys time (st.err 0.52%)
  Linux build with -j12, init_on_alloc=1: -0.13% wall time (st.err 0.42%)
  Linux build with -j12, init_on_alloc=1: +0.57% sys time (st.err 0.40%)

The slowdown for init_on_free=0, init_on_alloc=0 compared to the baseline
is within the standard error.

The new features are also going to pave the way for hardware memory
tagging (e.g. arm64's MTE), which will require both on_alloc and on_free
hooks to set the tags for heap objects. With MTE, tagging will have the
same cost as memory initialization.

Although init_on_free is rather costly, there are paranoid use-cases where
in-memory data lifetime is desired to be minimized. There are various
arguments for and against the realism of the associated threat models,
but given that we'll need the infrastructure for MTE anyway, and there are
people who want wipe-on-free behavior no matter what the performance cost,
it seems reasonable to include it in this series.

[glider@google.com: v8]
  Link: http://lkml.kernel.org/r/20190626121943.131390-2-glider@google.com
[glider@google.com: v9]
  Link: http://lkml.kernel.org/r/20190627130316.254309-2-glider@google.com
[glider@google.com: v10]
  Link: http://lkml.kernel.org/r/20190628093131.199499-2-glider@google.com
Link: http://lkml.kernel.org/r/20190617151050.92663-2-glider@google.com
Signed-off-by: Alexander Potapenko
Acked-by: Kees Cook
Acked-by: Michal Hocko
[page and dmapool parts Acked-by: James Morris]
Cc: Christoph Lameter
Cc: Masahiro Yamada
Cc: "Serge E. Hallyn"
Cc: Nick Desaulniers
Cc: Kostya Serebryany
Cc: Dmitry Vyukov
Cc: Sandeep Patil
Cc: Laura Abbott
Cc: Randy Dunlap
Cc: Jann Horn
Cc: Mark Rutland
Cc: Marco Elver
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bb242ad810eb..f88f0eabcc5e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2700,6 +2700,30 @@ static inline void kernel_poison_pages(struct page *page, int numpages,
 					int enable) { }
 #endif
 
+#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
+DECLARE_STATIC_KEY_TRUE(init_on_alloc);
+#else
+DECLARE_STATIC_KEY_FALSE(init_on_alloc);
+#endif
+static inline bool want_init_on_alloc(gfp_t flags)
+{
+	if (static_branch_unlikely(&init_on_alloc) &&
+	    !page_poisoning_enabled())
+		return true;
+	return flags & __GFP_ZERO;
+}
+
+#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
+DECLARE_STATIC_KEY_TRUE(init_on_free);
+#else
+DECLARE_STATIC_KEY_FALSE(init_on_free);
+#endif
+static inline bool want_init_on_free(void)
+{
+	return static_branch_unlikely(&init_on_free) &&
+	       !page_poisoning_enabled();
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
 DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
 #else
-- cgit v1.2.3
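To illustrate how the hooks are meant to be used (a sketch under
assumptions, not the actual allocator changes): an allocation path can
gate its wiping on want_init_on_alloc(), with the static branch keeping
the check essentially free when the feature is off. Here
kernel_init_free_pages() stands for the page-wiping helper this series
adds elsewhere:

/*
 * Illustrative sketch of an allocation path.  want_init_on_alloc()
 * folds the __GFP_ZERO check and the init_on_alloc static key into one
 * predicate; kernel_init_free_pages() is assumed to be the zeroing
 * helper introduced later in this series.
 */
static void post_alloc_hook_sketch(struct page *page, unsigned int order,
				   gfp_t gfp_flags)
{
	if (want_init_on_alloc(gfp_flags))
		kernel_init_free_pages(page, 1 << order);
}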
From 97105f0ab7b877a8ece2005e214894e93793950c Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Thu, 11 Jul 2019 21:00:13 -0700
Subject: mm: vmalloc: show number of vmalloc pages in /proc/meminfo

vmalloc() is getting used more and more these days (kernel stacks, bpf and
the percpu allocator are new top users), and the total share of memory it
consumes can be pretty significant and changes dynamically.

/proc/meminfo is the best place to display this information: its top goal
is to show the top consumers of memory.

Since the VmallocUsed field in /proc/meminfo has not been in use for quite
a long time (it has been defined to 0 by a5ad88ce8c7f ("mm: get rid of
'vmalloc_info' from /proc/meminfo")), let's reuse it to show the actual
physical memory consumption of vmalloc().

Link: http://lkml.kernel.org/r/20190417194002.12369-3-guro@fb.com
Signed-off-by: Roman Gushchin
Acked-by: Johannes Weiner
Acked-by: Vlastimil Babka
Reviewed-by: Andrew Morton
Cc: Matthew Wilcox
Cc: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/vmalloc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 51e131245379..9b21d0047710 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -72,10 +72,12 @@ extern void vm_unmap_aliases(void);
 
 #ifdef CONFIG_MMU
 extern void __init vmalloc_init(void);
+extern unsigned long vmalloc_nr_pages(void);
 #else
 static inline void vmalloc_init(void)
 {
 }
+static inline unsigned long vmalloc_nr_pages(void) { return 0; }
 #endif
 
 extern void *vmalloc(unsigned long size);
-- cgit v1.2.3
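For illustration (a sketch, not the actual fs/proc/meminfo.c change):
the reporting side only needs to convert the page count to kilobytes:

/*
 * Illustrative sketch of the /proc/meminfo side.  vmalloc_nr_pages()
 * counts the physical pages backing vmalloc areas, so this reports
 * real memory consumption rather than virtual address space usage.
 */
static void report_vmalloc_used_sketch(struct seq_file *m)
{
	/* pages -> kB: one page is (1 << (PAGE_SHIFT - 10)) kB */
	seq_printf(m, "VmallocUsed:    %8lu kB\n",
		   vmalloc_nr_pages() << (PAGE_SHIFT - 10));
}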
From 6ba749ee78ef42ffdf4b95c042fc574a37d229d9 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Thu, 11 Jul 2019 21:00:26 -0700
Subject: mm, oom: remove redundant task_in_mem_cgroup() check

oom_unkillable_task() can be called from three different contexts, i.e.
global OOM, memcg OOM and the oom_score procfs interface. At the moment
oom_unkillable_task() does a task_in_mem_cgroup() check on the given
process. Since there is no reason to perform the task_in_mem_cgroup()
check for global OOM and the oom_score procfs interface, those contexts
provide a NULL memcg and skip the task_in_mem_cgroup() check.

However for the memcg OOM context, oom_unkillable_task() is always called
from mem_cgroup_scan_tasks(), so the task_in_mem_cgroup() check is
redundant and effectively dead code. So, just remove the
task_in_mem_cgroup() check altogether.

Link: http://lkml.kernel.org/r/20190624212631.87212-2-shakeelb@google.com
Signed-off-by: Shakeel Butt
Signed-off-by: Tetsuo Handa
Acked-by: Roman Gushchin
Acked-by: Michal Hocko
Cc: David Rientjes
Cc: Johannes Weiner
Cc: KOSAKI Motohiro
Cc: Nick Piggin
Cc: Paul Jackson
Cc: Vladimir Davydov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 7 -------
 include/linux/oom.h        | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 68402842c337..44c41462be33 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -394,7 +394,6 @@ out:
 
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
@@ -875,12 +874,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return true;
 }
 
-static inline bool task_in_mem_cgroup(struct task_struct *task,
-				      const struct mem_cgroup *memcg)
-{
-	return true;
-}
-
 static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	return NULL;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index d07992009265..b75104690311 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -108,7 +108,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask,
+		const nodemask_t *nodemask,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
-- cgit v1.2.3
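A sketch of why the check was dead code in the memcg path (illustrative,
not from the patch): the memcg OOM killer only ever iterates tasks that
belong to the target cgroup, so every task oom_unkillable_task() sees
there has already passed the membership test:

/*
 * Illustrative sketch: in a memcg OOM, victim selection walks only the
 * tasks of the OOMing cgroup, so a task_in_mem_cgroup() re-check inside
 * the callback can never fail.
 */
static void select_bad_process_sketch(struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	/* ... the global case iterates the whole tasklist instead ... */
}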
From ac311a14c682dcd8a120a6244d0542ec654e3d93 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Thu, 11 Jul 2019 21:00:31 -0700
Subject: oom: decouple mems_allowed from oom_unkillable_task

Commit ef08e3b4981a ("[PATCH] cpusets: confine oom_killer to
mem_exclusive cpuset") introduced a heuristic where a potential oom-killer
victim is skipped if the intersection of the mems_allowed of the potential
victim and of current (the process that triggered the oom) is empty, on
the grounds that killing such a victim will most probably not help the
current allocating process.

However commit 7887a3da753e ("[PATCH] oom: cpuset hint") changed the
heuristic to just decrease the oom_badness scores of such potential
victims, on the grounds that the cpuset of such processes might have
changed, and they may previously have allocated memory on mems from which
the current allocating process can allocate.

Unintentionally, 7887a3da753e ("[PATCH] oom: cpuset hint") introduced a
side effect: since oom_badness is also exposed to user space through
/proc/[pid]/oom_score, readers with different cpusets can read different
oom_score values for the same process.

Later, commit 6cf86ac6f36b ("oom: filter tasks not sharing the same
cpuset") fixed the side effect introduced by 7887a3da753e by moving the
cpuset intersection back into the oom-killer context only, and out of
oom_badness.

However the combination of ab290adbaf8f ("oom: make
oom_unkillable_task() helper function") and 26ebc984913b ("oom:
/proc/<pid>/oom_score treat kernel thread honestly") unintentionally
brought the cpuset intersection check back into the oom_badness
calculation function.

Beyond doing the cpuset/mempolicy intersection in oom_badness, the memcg
oom context also does the cpuset/mempolicy intersection, which is quite
wrong and was caught by syzkaller with the following report:

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 28426 Comm: syz-executor.5 Not tainted 5.2.0-rc3-next-20190607
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline]
RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline]
RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline]
RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155
Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff
RSP: 0018:ffff888000127490 EFLAGS: 00010a03
RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c
RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001
RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0
R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007
R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6
FS:  00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000607304 CR3: 000000009237e000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600
Call Trace:
 oom_evaluate_task+0x49/0x520 mm/oom_kill.c:321
 mem_cgroup_scan_tasks+0xcc/0x180 mm/memcontrol.c:1169
 select_bad_process mm/oom_kill.c:374 [inline]
 out_of_memory mm/oom_kill.c:1088 [inline]
 out_of_memory+0x6b2/0x1280 mm/oom_kill.c:1035
 mem_cgroup_out_of_memory+0x1ca/0x230 mm/memcontrol.c:1573
 mem_cgroup_oom mm/memcontrol.c:1905 [inline]
 try_charge+0xfbe/0x1480 mm/memcontrol.c:2468
 mem_cgroup_try_charge+0x24d/0x5e0 mm/memcontrol.c:6073
 mem_cgroup_try_charge_delay+0x1f/0xa0 mm/memcontrol.c:6088
 do_huge_pmd_wp_page_fallback+0x24f/0x1680 mm/huge_memory.c:1201
 do_huge_pmd_wp_page+0x7fc/0x2160 mm/huge_memory.c:1359
 wp_huge_pmd mm/memory.c:3793 [inline]
 __handle_mm_fault+0x164c/0x3eb0 mm/memory.c:4006
 handle_mm_fault+0x3b7/0xa90 mm/memory.c:4053
 do_user_addr_fault arch/x86/mm/fault.c:1455 [inline]
 __do_page_fault+0x5ef/0xda0 arch/x86/mm/fault.c:1521
 do_page_fault+0x71/0x57d arch/x86/mm/fault.c:1552
 page_fault+0x1e/0x30 arch/x86/entry/entry_64.S:1156
RIP: 0033:0x400590
Code: 06 e9 49 01 00 00 48 8b 44 24 10 48 0b 44 24 28 75 1f 48 8b 14 24 48 8b 7c 24 20 be 04 00 00 00 e8 f5 56 00 00 48 8b 74 24 08 <89> 06 e9 1e 01 00 00 48 8b 44 24 08 48 8b 14 24 be 04 00 00 00 8b
RSP: 002b:00007fff7bc49780 EFLAGS: 00010206
RAX: 0000000000000001 RBX: 0000000000760000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 000000002000cffc RDI: 0000000000000001
RBP: fffffffffffffffe R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000075 R11: 0000000000000246 R12: 0000000000760008
R13: 00000000004c55f2 R14: 0000000000000000 R15: 00007fff7bc499b0
Modules linked in:
---[ end trace a65689219582ffff ]---
RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline]
RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline]
RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline]
RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155
Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff
RSP: 0018:ffff888000127490 EFLAGS: 00010a03
RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c
RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001
RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0
R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007
R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6
FS:  00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b2f823000 CR3: 000000009237e000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600

The fix is to decouple the cpuset/mempolicy intersection check from
oom_unkillable_task() and make sure the cpuset/mempolicy intersection
check is only done in the global oom context.

[shakeelb@google.com: change function name and update comment]
  Link: http://lkml.kernel.org/r/20190628152421.198994-3-shakeelb@google.com
Link: http://lkml.kernel.org/r/20190624212631.87212-3-shakeelb@google.com
Signed-off-by: Shakeel Butt
Reported-by: syzbot+d0fc9d3c166bc5e4a94b@syzkaller.appspotmail.com
Acked-by: Roman Gushchin
Acked-by: Michal Hocko
Cc: David Rientjes
Cc: Johannes Weiner
Cc: KOSAKI Motohiro
Cc: Nick Piggin
Cc: Paul Jackson
Cc: Tetsuo Handa
Cc: Vladimir Davydov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/oom.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index b75104690311..c696c265f019 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
-		const nodemask_t *nodemask,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
-- cgit v1.2.3
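For illustration (a sketch under assumptions, not the exact patch): the
intersection check survives in its own helper, consulted only by the
global OOM path; the memcg path reports every task as eligible, so
oom_badness() and /proc/<pid>/oom_score no longer depend on the reader's
cpuset:

/*
 * Sketch of the decoupled check.  In the final patch the helper is
 * named oom_cpuset_eligible(); this condensed version checks only the
 * given task rather than iterating every thread of the group.
 */
static bool oom_cpuset_eligible_sketch(struct task_struct *tsk,
				       struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		return true;	/* cpuset constraint is irrelevant for memcg OOM */

	return oc->nodemask ?
		mempolicy_nodemask_intersects(tsk, oc->nodemask) :
		cpuset_mems_allowed_intersects(current, tsk);
}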