From d50112edde1d0c621520e53747044009f11c656b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 15 Nov 2017 17:32:18 -0800
Subject: slab, slub, slob: add slab_flags_t

Add sparse-checked slab_flags_t for struct kmem_cache::flags (SLAB_POISON,
etc).

SLAB is bloated temporarily by switching to "unsigned long", but only
temporarily.

Link: http://lkml.kernel.org/r/20171021100225.GA22428@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h    |  4 ++--
 include/linux/kmemleak.h |  8 +++----
 include/linux/slab.h     | 60 +++++++++++++++++++++++++++++-------------------
 include/linux/slab_def.h |  2 +-
 include/linux/slub_def.h |  2 +-
 include/linux/types.h    |  1 +
 include/net/sock.h       |  2 +-
 7 files changed, 47 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 5017269e3f04..e3eb834c9a35 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -46,7 +46,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order);
 void kasan_free_pages(struct page *page, unsigned int order);
 
 void kasan_cache_create(struct kmem_cache *cache, size_t *size,
-			unsigned long *flags);
+			slab_flags_t *flags);
 void kasan_cache_shrink(struct kmem_cache *cache);
 void kasan_cache_shutdown(struct kmem_cache *cache);
 
@@ -95,7 +95,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {}
 
 static inline void kasan_cache_create(struct kmem_cache *cache,
 				      size_t *size,
-				      unsigned long *flags) {}
+				      slab_flags_t *flags) {}
 static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
 static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
 
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 590343f6c1b1..5ac416e2d339 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -48,14 +48,14 @@ extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref;
 extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
 
 static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
-					    int min_count, unsigned long flags,
+					    int min_count, slab_flags_t flags,
 					    gfp_t gfp)
 {
 	if (!(flags & SLAB_NOLEAKTRACE))
 		kmemleak_alloc(ptr, size, min_count, gfp);
 }
 
-static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags)
+static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
 {
 	if (!(flags & SLAB_NOLEAKTRACE))
 		kmemleak_free(ptr);
@@ -76,7 +76,7 @@ static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count,
 {
 }
 static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
-					    int min_count, unsigned long flags,
+					    int min_count, slab_flags_t flags,
 					    gfp_t gfp)
 {
 }
@@ -94,7 +94,7 @@ static inline void kmemleak_free(const void *ptr)
 static inline void kmemleak_free_part(const void *ptr, size_t size)
 {
 }
-static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags)
+static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
 {
 }
 static inline void kmemleak_free_percpu(const void __percpu *ptr)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index af5aa65c7c18..0c4c579f52ed 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -21,13 +21,20 @@
  * Flags to pass to kmem_cache_create().
  * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
  */
-#define SLAB_CONSISTENCY_CHECKS	0x00000100UL	/* DEBUG: Perform (expensive) checks on alloc/free */
-#define SLAB_RED_ZONE		0x00000400UL	/* DEBUG: Red zone objs in a cache */
-#define SLAB_POISON		0x00000800UL	/* DEBUG: Poison objects */
-#define SLAB_HWCACHE_ALIGN	0x00002000UL	/* Align objs on cache lines */
-#define SLAB_CACHE_DMA		0x00004000UL	/* Use GFP_DMA memory */
-#define SLAB_STORE_USER		0x00010000UL	/* DEBUG: Store the last owner for bug hunting */
-#define SLAB_PANIC		0x00040000UL	/* Panic if kmem_cache_create() fails */
+/* DEBUG: Perform (expensive) checks on alloc/free */
+#define SLAB_CONSISTENCY_CHECKS	((slab_flags_t __force)0x00000100UL)
+/* DEBUG: Red zone objs in a cache */
+#define SLAB_RED_ZONE		((slab_flags_t __force)0x00000400UL)
+/* DEBUG: Poison objects */
+#define SLAB_POISON		((slab_flags_t __force)0x00000800UL)
+/* Align objs on cache lines */
+#define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000UL)
+/* Use GFP_DMA memory */
+#define SLAB_CACHE_DMA		((slab_flags_t __force)0x00004000UL)
+/* DEBUG: Store the last owner for bug hunting */
+#define SLAB_STORE_USER		((slab_flags_t __force)0x00010000UL)
+/* Panic if kmem_cache_create() fails */
+#define SLAB_PANIC		((slab_flags_t __force)0x00040000UL)
 /*
  * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
  *
@@ -65,44 +72,51 @@
  *
  * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
  */
-#define SLAB_TYPESAFE_BY_RCU	0x00080000UL	/* Defer freeing slabs to RCU */
-#define SLAB_MEM_SPREAD		0x00100000UL	/* Spread some memory over cpuset */
-#define SLAB_TRACE		0x00200000UL	/* Trace allocations and frees */
+/* Defer freeing slabs to RCU */
+#define SLAB_TYPESAFE_BY_RCU	((slab_flags_t __force)0x00080000UL)
+/* Spread some memory over cpuset */
+#define SLAB_MEM_SPREAD		((slab_flags_t __force)0x00100000UL)
+/* Trace allocations and frees */
+#define SLAB_TRACE		((slab_flags_t __force)0x00200000UL)
 
 /* Flag to prevent checks on free */
 #ifdef CONFIG_DEBUG_OBJECTS
-# define SLAB_DEBUG_OBJECTS	0x00400000UL
+# define SLAB_DEBUG_OBJECTS	((slab_flags_t __force)0x00400000UL)
 #else
-# define SLAB_DEBUG_OBJECTS	0x00000000UL
+# define SLAB_DEBUG_OBJECTS	((slab_flags_t __force)0x00000000UL)
 #endif
 
-#define SLAB_NOLEAKTRACE	0x00800000UL	/* Avoid kmemleak tracing */
+/* Avoid kmemleak tracing */
+#define SLAB_NOLEAKTRACE	((slab_flags_t __force)0x00800000UL)
 
 /* Don't track use of uninitialized memory */
 #ifdef CONFIG_KMEMCHECK
-# define SLAB_NOTRACK		0x01000000UL
+# define SLAB_NOTRACK		((slab_flags_t __force)0x01000000UL)
 #else
-# define SLAB_NOTRACK		0x00000000UL
+# define SLAB_NOTRACK		((slab_flags_t __force)0x00000000UL)
 #endif
+/* Fault injection mark */
 #ifdef CONFIG_FAILSLAB
-# define SLAB_FAILSLAB		0x02000000UL	/* Fault injection mark */
+# define SLAB_FAILSLAB		((slab_flags_t __force)0x02000000UL)
 #else
-# define SLAB_FAILSLAB		0x00000000UL
+# define SLAB_FAILSLAB		((slab_flags_t __force)0x00000000UL)
 #endif
+/* Account to memcg */
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
-# define SLAB_ACCOUNT		0x04000000UL	/* Account to memcg */
+# define SLAB_ACCOUNT		((slab_flags_t __force)0x04000000UL)
 #else
-# define SLAB_ACCOUNT		0x00000000UL
+# define SLAB_ACCOUNT		((slab_flags_t __force)0x00000000UL)
 #endif
 
 #ifdef CONFIG_KASAN
-#define SLAB_KASAN		0x08000000UL
+#define SLAB_KASAN		((slab_flags_t __force)0x08000000UL)
 #else
-#define SLAB_KASAN		0x00000000UL
+#define SLAB_KASAN		((slab_flags_t __force)0x00000000UL)
 #endif
 
 /* The following flags affect the page allocator grouping pages by mobility */
-#define SLAB_RECLAIM_ACCOUNT	0x00020000UL		/* Objects are reclaimable */
+/* Objects are reclaimable */
+#define SLAB_RECLAIM_ACCOUNT	((slab_flags_t __force)0x00020000UL)
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
@@ -128,7 +142,7 @@ void __init kmem_cache_init(void);
 bool slab_is_available(void);
 
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
-			unsigned long,
+			slab_flags_t,
 			void (*)(void *));
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 8f7d2b1656d2..072e46e9e1d5 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -20,7 +20,7 @@ struct kmem_cache {
 	struct reciprocal_value reciprocal_buffer_size;
 /* 2) touched by every alloc & free from the backend */
 
-	unsigned int flags;		/* constant flags */
+	slab_flags_t flags;		/* constant flags */
 	unsigned int num;		/* # of objs per slab */
 
 /* 3) cache_grow/shrink */
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 39fa09bcde23..0adae162dc8f 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -82,7 +82,7 @@ struct kmem_cache_order_objects {
 struct kmem_cache {
 	struct kmem_cache_cpu __percpu *cpu_slab;
 	/* Used for retriving partial slabs etc */
-	unsigned long flags;
+	slab_flags_t flags;
 	unsigned long min_partial;
 	int size;		/* The size of an object including meta data */
 	int object_size;	/* The size of an object without meta data */
diff --git a/include/linux/types.h b/include/linux/types.h
index 34fce54e4f1b..732b52c2eae4 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -156,6 +156,7 @@ typedef u32 dma_addr_t;
 #endif
 
 typedef unsigned __bitwise gfp_t;
+typedef unsigned long __bitwise slab_flags_t;
 typedef unsigned __bitwise fmode_t;
 
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
diff --git a/include/net/sock.h b/include/net/sock.h
index a6b9a8d1a6df..c577286dbffb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1105,7 +1105,7 @@ struct proto {
 
 	struct kmem_cache	*slab;
 	unsigned int		obj_size;
-	int			slab_flags;
+	slab_flags_t		slab_flags;
 
 	struct percpu_counter	*orphan_count;
 
-- 
cgit v1.2.3


From 4fd0b46e898791009b03b2fdd6510044fa8730a6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 15 Nov 2017 17:32:21 -0800
Subject: slab, slub, slob: convert slab_flags_t to 32-bit

struct kmem_cache::flags is "unsigned long" which is unnecessary on
64-bit as no flags are defined in the higher bits.

Switch the field to 32-bit and save some space on x86_64 until such
flags appear:

	add/remove: 0/0 grow/shrink: 0/107 up/down: 0/-657 (-657)
	function                                     old     new   delta
	sysfs_slab_add                               720     719      -1
				...
	check_object                                 699     676     -23

[akpm@linux-foundation.org: fix printk warning]
Link: http://lkml.kernel.org/r/20171021100635.GA8287@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h  | 44 ++++++++++++++++++++++----------------------
 include/linux/types.h |  2 +-
 2 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0c4c579f52ed..f37cb93768ab 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -22,19 +22,19 @@
  * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
  */
 /* DEBUG: Perform (expensive) checks on alloc/free */
-#define SLAB_CONSISTENCY_CHECKS	((slab_flags_t __force)0x00000100UL)
+#define SLAB_CONSISTENCY_CHECKS	((slab_flags_t __force)0x00000100U)
 /* DEBUG: Red zone objs in a cache */
-#define SLAB_RED_ZONE		((slab_flags_t __force)0x00000400UL)
+#define SLAB_RED_ZONE		((slab_flags_t __force)0x00000400U)
 /* DEBUG: Poison objects */
-#define SLAB_POISON		((slab_flags_t __force)0x00000800UL)
+#define SLAB_POISON		((slab_flags_t __force)0x00000800U)
 /* Align objs on cache lines */
-#define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000UL)
+#define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)
 /* Use GFP_DMA memory */
-#define SLAB_CACHE_DMA		((slab_flags_t __force)0x00004000UL)
+#define SLAB_CACHE_DMA		((slab_flags_t __force)0x00004000U)
 /* DEBUG: Store the last owner for bug hunting */
-#define SLAB_STORE_USER		((slab_flags_t __force)0x00010000UL)
+#define SLAB_STORE_USER		((slab_flags_t __force)0x00010000U)
 /* Panic if kmem_cache_create() fails */
-#define SLAB_PANIC		((slab_flags_t __force)0x00040000UL)
+#define SLAB_PANIC		((slab_flags_t __force)0x00040000U)
 /*
  * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
  *
@@ -73,50 +73,50 @@
  * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
  */
 /* Defer freeing slabs to RCU */
-#define SLAB_TYPESAFE_BY_RCU	((slab_flags_t __force)0x00080000UL)
+#define SLAB_TYPESAFE_BY_RCU	((slab_flags_t __force)0x00080000U)
 /* Spread some memory over cpuset */
-#define SLAB_MEM_SPREAD		((slab_flags_t __force)0x00100000UL)
+#define SLAB_MEM_SPREAD		((slab_flags_t __force)0x00100000U)
 /* Trace allocations and frees */
-#define SLAB_TRACE		((slab_flags_t __force)0x00200000UL)
+#define SLAB_TRACE		((slab_flags_t __force)0x00200000U)
 
 /* Flag to prevent checks on free */
 #ifdef CONFIG_DEBUG_OBJECTS
-# define SLAB_DEBUG_OBJECTS	((slab_flags_t __force)0x00400000UL)
+# define SLAB_DEBUG_OBJECTS	((slab_flags_t __force)0x00400000U)
 #else
-# define SLAB_DEBUG_OBJECTS	((slab_flags_t __force)0x00000000UL)
+# define SLAB_DEBUG_OBJECTS	0
 #endif
 
 /* Avoid kmemleak tracing */
-#define SLAB_NOLEAKTRACE	((slab_flags_t __force)0x00800000UL)
+#define SLAB_NOLEAKTRACE	((slab_flags_t __force)0x00800000U)
 
 /* Don't track use of uninitialized memory */
 #ifdef CONFIG_KMEMCHECK
-# define SLAB_NOTRACK		((slab_flags_t __force)0x01000000UL)
+# define SLAB_NOTRACK		((slab_flags_t __force)0x01000000U)
 #else
-# define SLAB_NOTRACK		((slab_flags_t __force)0x00000000UL)
+# define SLAB_NOTRACK		0
 #endif
 /* Fault injection mark */
 #ifdef CONFIG_FAILSLAB
-# define SLAB_FAILSLAB		((slab_flags_t __force)0x02000000UL)
+# define SLAB_FAILSLAB		((slab_flags_t __force)0x02000000U)
 #else
-# define SLAB_FAILSLAB		((slab_flags_t __force)0x00000000UL)
+# define SLAB_FAILSLAB		0
 #endif
 /* Account to memcg */
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
-# define SLAB_ACCOUNT		((slab_flags_t __force)0x04000000UL)
+# define SLAB_ACCOUNT		((slab_flags_t __force)0x04000000U)
 #else
-# define SLAB_ACCOUNT		((slab_flags_t __force)0x00000000UL)
+# define SLAB_ACCOUNT		0
 #endif
 
 #ifdef CONFIG_KASAN
-#define SLAB_KASAN		((slab_flags_t __force)0x08000000UL)
+#define SLAB_KASAN		((slab_flags_t __force)0x08000000U)
 #else
-#define SLAB_KASAN		((slab_flags_t __force)0x00000000UL)
+#define SLAB_KASAN		0
 #endif
 
 /* The following flags affect the page allocator grouping pages by mobility */
 /* Objects are reclaimable */
-#define SLAB_RECLAIM_ACCOUNT	((slab_flags_t __force)0x00020000UL)
+#define SLAB_RECLAIM_ACCOUNT	((slab_flags_t __force)0x00020000U)
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
diff --git a/include/linux/types.h b/include/linux/types.h
index 732b52c2eae4..c94d59ef96cc 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -156,7 +156,7 @@ typedef u32 dma_addr_t;
 #endif
 
 typedef unsigned __bitwise gfp_t;
-typedef unsigned long __bitwise slab_flags_t;
+typedef unsigned __bitwise slab_flags_t;
 typedef unsigned __bitwise fmode_t;
 
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
-- 
cgit v1.2.3


From 5799b255c491d853b73fb9e0e1760210315d06cd Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Wed, 15 Nov 2017 17:32:29 -0800
Subject: include/linux/slab.h: add kmalloc_array_node() and kcalloc_node()

Patch series "Add kmalloc_array_node() and kcalloc_node()".

Our current memeory allocation routines suffer form an API imbalance,
for one we have kmalloc_array() and kcalloc() which check for overflows
in size multiplication and we have kmalloc_node() and kzalloc_node()
which allow for memory allocation on a certain NUMA node but don't check
for eventual overflows.

This patch (of 6):

We have kmalloc_array() and kcalloc() wrappers on top of kmalloc() which
ensure us overflow free multiplication for the size of a memory
allocation but these implementations are not NUMA-aware.

Likewise we have kmalloc_node() which is a NUMA-aware version of
kmalloc() but the implementation is not aware of any possible overflows
in eventual size calculations.

Introduce a combination of the two above cases to have a NUMA-node aware
version of kmalloc_array() and kcalloc().

Link: http://lkml.kernel.org/r/20170927082038.3782-2-jthumshirn@suse.de
Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mike Marciniszyn <infinipath@intel.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index f37cb93768ab..ebddfca9e24b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -650,6 +650,22 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
 #define kmalloc_track_caller(size, flags) \
 	__kmalloc_track_caller(size, flags, _RET_IP_)
 
+static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
+				       int node)
+{
+	if (size != 0 && n > SIZE_MAX / size)
+		return NULL;
+	if (__builtin_constant_p(n) && __builtin_constant_p(size))
+		return kmalloc_node(n * size, flags, node);
+	return __kmalloc_node(n * size, flags, node);
+}
+
+static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
+{
+	return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
+}
+
+
 #ifdef CONFIG_NUMA
 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
 #define kmalloc_node_track_caller(size, flags, node) \
-- 
cgit v1.2.3


From 41710443f790b5f7f5305eba99dacce88e259f4c Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@intel.com>
Date: Wed, 15 Nov 2017 17:32:53 -0800
Subject: mm: update comments for struct page.mapping

struct page.mapping can be NULL or points to one object of type
address_space, anon_vma or KSM private structure.

Link: http://lkml.kernel.org/r/1506485067-15954-1-git-send-email-changbin.du@intel.com
Signed-off-by: Changbin Du <changbin.du@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c85f11dafd56..d1b8e8f97fc2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -48,8 +48,10 @@ struct page {
 						 * inode address_space, or NULL.
 						 * If page mapped as anonymous
 						 * memory, low bit is set, and
-						 * it points to anon_vma object:
-						 * see PAGE_MAPPING_ANON below.
+						 * it points to anon_vma object
+						 * or KSM private structure. See
+						 * PAGE_MAPPING_ANON and
+						 * PAGE_MAPPING_KSM.
 						 */
 		void *s_mem;			/* slab first object */
 		atomic_t compound_mapcount;	/* first tail page */
-- 
cgit v1.2.3


From 23c47d2ada9f96731492a67b28c0072715075baa Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 15 Nov 2017 17:33:00 -0800
Subject: bdi: introduce BDI_CAP_SYNCHRONOUS_IO

As discussed at

  https://lkml.kernel.org/r/<20170728165604.10455-1-ross.zwisler@linux.intel.com>

someday we will remove rw_page().  If so, we need something to detect
such super-fast storage on which synchronous IO operations like the
current rw_page are always a win.

Introduces BDI_CAP_SYNCHRONOUS_IO to indicate such devices.  With it, we
could use various optimization techniques.

Link: http://lkml.kernel.org/r/1505886205-9671-3-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/backing-dev.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f41ca8486e02..7360e79e9a2f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -122,6 +122,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
  *
  * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
+ * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be
+ *			   inefficient.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
 #define BDI_CAP_NO_WRITEBACK	0x00000002
@@ -129,6 +131,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_STABLE_WRITES	0x00000008
 #define BDI_CAP_STRICTLIMIT	0x00000010
 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020
+#define BDI_CAP_SYNCHRONOUS_IO	0x00000040
 
 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
 	(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
@@ -174,6 +177,11 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 long congestion_wait(int sync, long timeout);
 long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
 
+static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi)
+{
+	return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO;
+}
+
 static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi)
 {
 	return bdi->capabilities & BDI_CAP_STABLE_WRITES;
-- 
cgit v1.2.3


From 539a6fea7fdcade532bd3e77be2862a683f8f0c9 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 15 Nov 2017 17:33:04 -0800
Subject: mm, swap: introduce SWP_SYNCHRONOUS_IO

If rw-page based fast storage is used for swap devices, we need to
detect it to enhance swap IO operations.  This patch is preparation for
optimizing of swap-in operation with next patch.

Link: http://lkml.kernel.org/r/1505886205-9671-4-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index f02fb5db8914..933d7c0c3542 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -171,8 +171,9 @@ enum {
 	SWP_AREA_DISCARD = (1 << 8),	/* single-time swap area discards */
 	SWP_PAGE_DISCARD = (1 << 9),	/* freed swap page-cluster discards */
 	SWP_STABLE_WRITES = (1 << 10),	/* no overwrite PG_writeback pages */
+	SWP_SYNCHRONOUS_IO = (1 << 11),	/* synchronous IO is efficient */
 					/* add others here before... */
-	SWP_SCANNING	= (1 << 11),	/* refcount in scan_swap_map */
+	SWP_SCANNING	= (1 << 12),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL
-- 
cgit v1.2.3


From 0bcac06f27d7528591c27ac2b093ccd71c5d0168 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 15 Nov 2017 17:33:07 -0800
Subject: mm, swap: skip swapcache for swapin of synchronous device

With fast swap storage, the platforms want to use swap more aggressively
and swap-in is crucial to application latency.

The rw_page() based synchronous devices like zram, pmem and btt are such
fast storage.  When I profile swapin performance with zram lz4
decompress test, S/W overhead is more than 70%.  Maybe, it would be
bigger in nvdimm.

This patch aims to reduce swap-in latency by skipping swapcache if the
swap device is synchronous device like rw_page based device.  It
enhances 45% my swapin test(5G sequential swapin, no readahead, from
2.41sec to 1.64sec).

Link: http://lkml.kernel.org/r/1505886205-9671-5-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 933d7c0c3542..32c06f028c7b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -466,6 +466,7 @@ extern int page_swapcount(struct page *);
 extern int __swp_swapcount(swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
+extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
 extern bool reuse_swap_page(struct page *, int *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
@@ -474,6 +475,16 @@ extern void exit_swap_address_space(unsigned int type);
 
 #else /* CONFIG_SWAP */
 
+static inline int swap_readpage(struct page *page, bool do_poll)
+{
+	return 0;
+}
+
+static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
+{
+	return NULL;
+}
+
 #define swap_address_space(entry)		(NULL)
 #define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
-- 
cgit v1.2.3


From aa8d22a11da933dbf880b4933b58931f4aefe91c Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 15 Nov 2017 17:33:11 -0800
Subject: mm: swap: SWP_SYNCHRONOUS_IO: skip swapcache only if swapped page has
 no other reference

When SWP_SYNCHRONOUS_IO swapped-in pages are shared by several
processes, it can cause unnecessary memory wastage by skipping swap
cache.  Because, with swapin fault by read, they could share a page if
the page were in swap cache.  Thus, it avoids allocating same content
new pages.

This patch makes the swapcache skipping work only if the swap pte is
non-sharable.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/1507620825-5537-1-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 32c06f028c7b..8b8a6f965785 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -463,6 +463,7 @@ extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
+extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry);
 extern int __swp_swapcount(swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
@@ -589,6 +590,11 @@ static inline int page_swapcount(struct page *page)
 	return 0;
 }
 
+static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+{
+	return 0;
+}
+
 static inline int __swp_swapcount(swp_entry_t entry)
 {
 	return 0;
-- 
cgit v1.2.3


From 4da2ce250f986060750fcc5b29112914e31803ba Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 15 Nov 2017 17:33:26 -0800
Subject: mm: distinguish CMA and MOVABLE isolation in has_unmovable_pages()

Joonsoo has noticed that "mm: drop migrate type checks from
has_unmovable_pages" would break CMA allocator because it relies on
has_unmovable_pages returning false even for CMA pageblocks which in
fact don't have to be movable:

 alloc_contig_range
   start_isolate_page_range
     set_migratetype_isolate
       has_unmovable_pages

This is a result of the code sharing between CMA and memory hotplug
while each one has a different idea of what has_unmovable_pages should
return.  This is unfortunate but fixing it properly would require a lot
of code duplication.

Fix the issue by introducing the requested migrate type argument and
special case MIGRATE_CMA case where CMA page blocks are handled
properly.  This will work for memory hotplug because it requires
MIGRATE_MOVABLE.

Link: http://lkml.kernel.org/r/20171019122118.y6cndierwl2vnguj@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Tested-by: Stefan Wahren <stefan.wahren@i2se.com>
Tested-by: Ran Wang <ran.wang_1@nxp.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Yasuaki Ishimatsu <yasu.isimatu@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-isolation.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 05a04e603686..cdad58bbfd8b 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -31,7 +31,7 @@ static inline bool is_migrate_isolate(int migratetype)
 #endif
 
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
-			 bool skip_hwpoisoned_pages);
+			 int migratetype, bool skip_hwpoisoned_pages);
 void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype, int *num_movable);
-- 
cgit v1.2.3


From 66e8b438bd5c75498cfe915c4219049eaebcb869 Mon Sep 17 00:00:00 2001
From: Gioh Kim <gi-oh.kim@profitbricks.com>
Date: Wed, 15 Nov 2017 17:33:42 -0800
Subject: mm/memblock.c: make the index explicit argument of
 for_each_memblock_type

for_each_memblock_type macro function relies on idx variable defined in
the caller context.  Silent macro arguments are almost always wrong
thing to do.  They make code harder to read and easier to get wrong.
Let's use an explicit iterator parameter for for_each_memblock_type and
make the code more obious.  This patch is a mere cleanup and it
shouldn't introduce any functional change.

Link: http://lkml.kernel.org/r/20170913133029.28911-1-gi-oh.kim@profitbricks.com
Signed-off-by: Gioh Kim <gi-oh.kim@profitbricks.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index bae11c7e7bf3..ce0e5634c2f9 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -389,10 +389,10 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
 	     region < (memblock.memblock_type.regions + memblock.memblock_type.cnt);	\
 	     region++)
 
-#define for_each_memblock_type(memblock_type, rgn)			\
-	for (idx = 0, rgn = &memblock_type->regions[0];			\
-	     idx < memblock_type->cnt;					\
-	     idx++, rgn = &memblock_type->regions[idx])
+#define for_each_memblock_type(i, memblock_type, rgn)			\
+	for (i = 0, rgn = &memblock_type->regions[0];			\
+	     i < memblock_type->cnt;					\
+	     i++, rgn = &memblock_type->regions[i])
 
 #ifdef CONFIG_MEMTEST
 extern void early_memtest(phys_addr_t start, phys_addr_t end);
-- 
cgit v1.2.3


From 0bea803e9e6bf3836d5368a6426c30a8c0e5eab5 Mon Sep 17 00:00:00 2001
From: Ralph Campbell <rcampbell@nvidia.com>
Date: Wed, 15 Nov 2017 17:34:00 -0800
Subject: mm/hmm: constify hmm_devmem_page_get_drvdata() parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Constify pointer parameter to avoid issue when use from code that only
has const struct page pointer to use in the first place.

Link: http://lkml.kernel.org/r/1506972774-10191-1-git-send-email-jglisse@redhat.com
Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hmm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 96e69979f84d..325017ad9311 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -471,9 +471,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page,
  * @page: pointer to struct page
  * Return: driver data value
  */
-static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page)
+static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
 {
-	unsigned long *drvdata = (unsigned long *)&page->pgmap;
+	const unsigned long *drvdata = (const unsigned long *)&page->pgmap;
 
 	return drvdata[1];
 }
-- 
cgit v1.2.3


From 0f10851ea475e08896ee5d9a2036d1bb46a8f3a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
Date: Wed, 15 Nov 2017 17:34:07 -0800
Subject: mm/mmu_notifier: avoid double notification when it is useless
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch only affects users of mmu_notifier->invalidate_range callback
which are device drivers related to ATS/PASID, CAPI, IOMMUv2, SVM ...
and it is an optimization for those users.  Everyone else is unaffected
by it.

When clearing a pte/pmd we are given a choice to notify the event under
the page table lock (notify version of *_clear_flush helpers do call the
mmu_notifier_invalidate_range).  But that notification is not necessary
in all cases.

This patch removes almost all cases where it is useless to have a call
to mmu_notifier_invalidate_range before
mmu_notifier_invalidate_range_end.  It also adds documentation in all
those cases explaining why.

Below is a more in depth analysis of why this is fine to do this:

For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when
device use thing like ATS/PASID to get the IOMMU to walk the CPU page
table to access a process virtual address space).  There is only 2 cases
when you need to notify those secondary TLB while holding page table
lock when clearing a pte/pmd:

  A) page backing address is free before mmu_notifier_invalidate_range_end
  B) a page table entry is updated to point to a new page (COW, write fault
     on zero page, __replace_page(), ...)

Case A is obvious you do not want to take the risk for the device to write
to a page that might now be used by something completely different.

Case B is more subtle. For correctness it requires the following sequence
to happen:
  - take page table lock
  - clear page table entry and notify (pmd/pte_huge_clear_flush_notify())
  - set page table entry to point to new page

If clearing the page table entry is not followed by a notify before setting
the new pte/pmd value then you can break memory model like C11 or C++11 for
the device.

Consider the following scenario (device use a feature similar to ATS/
PASID):

Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we
assume they are write protected for COW (other case of B apply too).

[Time N] -----------------------------------------------------------------
CPU-thread-0  {try to write to addrA}
CPU-thread-1  {try to write to addrB}
CPU-thread-2  {}
CPU-thread-3  {}
DEV-thread-0  {read addrA and populate device TLB}
DEV-thread-2  {read addrB and populate device TLB}
[Time N+1] ---------------------------------------------------------------
CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
CPU-thread-2  {}
CPU-thread-3  {}
DEV-thread-0  {}
DEV-thread-2  {}
[Time N+2] ---------------------------------------------------------------
CPU-thread-0  {COW_step1: {update page table point to new page for addrA}}
CPU-thread-1  {COW_step1: {update page table point to new page for addrB}}
CPU-thread-2  {}
CPU-thread-3  {}
DEV-thread-0  {}
DEV-thread-2  {}
[Time N+3] ---------------------------------------------------------------
CPU-thread-0  {preempted}
CPU-thread-1  {preempted}
CPU-thread-2  {write to addrA which is a write to new page}
CPU-thread-3  {}
DEV-thread-0  {}
DEV-thread-2  {}
[Time N+3] ---------------------------------------------------------------
CPU-thread-0  {preempted}
CPU-thread-1  {preempted}
CPU-thread-2  {}
CPU-thread-3  {write to addrB which is a write to new page}
DEV-thread-0  {}
DEV-thread-2  {}
[Time N+4] ---------------------------------------------------------------
CPU-thread-0  {preempted}
CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
CPU-thread-2  {}
CPU-thread-3  {}
DEV-thread-0  {}
DEV-thread-2  {}
[Time N+5] ---------------------------------------------------------------
CPU-thread-0  {preempted}
CPU-thread-1  {}
CPU-thread-2  {}
CPU-thread-3  {}
DEV-thread-0  {read addrA from old page}
DEV-thread-2  {read addrB from new page}

So here because at time N+2 the clear page table entry was not pair with a
notification to invalidate the secondary TLB, the device see the new value
for addrB before seing the new value for addrA.  This break total memory
ordering for the device.

When changing a pte to write protect or to point to a new write protected
page with same content (KSM) it is ok to delay invalidate_range callback
to mmu_notifier_invalidate_range_end() outside the page table lock.  This
is true even if the thread doing page table update is preempted right
after releasing page table lock before calling
mmu_notifier_invalidate_range_end

Thanks to Andrea for thinking of a problematic scenario for COW.

[jglisse@redhat.com: v2]
  Link: http://lkml.kernel.org/r/20171017031003.7481-2-jglisse@redhat.com
Link: http://lkml.kernel.org/r/20170901173011.10745-1-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alistair Popple <alistair@popple.id.au>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 2cf1c3c807f6..130831718e95 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -156,7 +156,8 @@ struct mmu_notifier_ops {
 	 * shared page-tables, it not necessary to implement the
 	 * invalidate_range_start()/end() notifiers, as
 	 * invalidate_range() alread catches the points in time when an
-	 * external TLB range needs to be flushed.
+	 * external TLB range needs to be flushed. For more in depth
+	 * discussion on this see Documentation/vm/mmu_notifier.txt
 	 *
 	 * The invalidate_range() function is called under the ptl
 	 * spin-lock and not allowed to sleep.
-- 
cgit v1.2.3


From 4645b9fe84bf4878f04c7959a75df7c3c2d1bbb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
Date: Wed, 15 Nov 2017 17:34:11 -0800
Subject: mm/mmu_notifier: avoid call to invalidate_range() in range_end()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is an optimization patch that only affect mmu_notifier users which
rely on the invalidate_range() callback.  This patch avoids calling that
callback twice in a row from inside __mmu_notifier_invalidate_range_end

Existing pattern (before this patch):
    mmu_notifier_invalidate_range_start()
        pte/pmd/pud_clear_flush_notify()
            mmu_notifier_invalidate_range()
    mmu_notifier_invalidate_range_end()
        mmu_notifier_invalidate_range()

New pattern (after this patch):
    mmu_notifier_invalidate_range_start()
        pte/pmd/pud_clear_flush_notify()
            mmu_notifier_invalidate_range()
    mmu_notifier_invalidate_range_only_end()

We call the invalidate_range callback after clearing the page table
under the page table lock and we skip the call to invalidate_range
inside the __mmu_notifier_invalidate_range_end() function.

Idea from Andrea Arcangeli

Link: http://lkml.kernel.org/r/20171017031003.7481-3-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alistair Popple <alistair@popple.id.au>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 130831718e95..b25dc9db19fc 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -214,7 +214,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
-				  unsigned long start, unsigned long end);
+				  unsigned long start, unsigned long end,
+				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 
@@ -268,7 +269,14 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
 	if (mm_has_notifiers(mm))
-		__mmu_notifier_invalidate_range_end(mm, start, end);
+		__mmu_notifier_invalidate_range_end(mm, start, end, false);
+}
+
+static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_range_end(mm, start, end, true);
 }
 
 static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
@@ -439,6 +447,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 {
 }
 
+static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+}
+
 static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
-- 
cgit v1.2.3


From 3a50d14d0df5776e002a8683a290c87eeac93a21 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Wed, 15 Nov 2017 17:34:15 -0800
Subject: mm: remove unused pgdat->inactive_ratio

Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file
list") 'pgdat->inactive_ratio' is not used, except for printing
"node_inactive_ratio: 0" in /proc/zoneinfo output.

Remove it.

Link: http://lkml.kernel.org/r/20171003152611.27483-1-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a507f43ad221..39ccaa9c428b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -712,12 +712,6 @@ typedef struct pglist_data {
 	/* Fields commonly accessed by the page reclaim scanner */
 	struct lruvec		lruvec;
 
-	/*
-	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-	 * this node's LRU.  Maintained by the pageout code.
-	 */
-	unsigned int inactive_ratio;
-
 	unsigned long		flags;
 
 	ZONE_PADDING(_pad2_)
-- 
cgit v1.2.3


From 8745808fda84c638e45cc860c8fb600bf4b0a2a6 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 15 Nov 2017 17:34:22 -0800
Subject: mm, arch: remove empty_bad_page*

empty_bad_page() and empty_bad_pte_table() seem to be relics from old
days which is not used by any code for a long time.  I have tried to
find when exactly but this is not really all that straightforward due to
many code movements - traces disappear around 2.4 times.

Anyway no code really references neither empty_bad_page nor
empty_bad_pte_table.  We only allocate the storage which is not used by
anybody so remove them.

Link: http://lkml.kernel.org/r/20171004150045.30755-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Ralf Baechle <ralf@linus-mips.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: David Howells <dhowells@redhat.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 584b14c774c1..3ec44e27aa9d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -18,7 +18,7 @@
  * Various page->flags bits:
  *
  * PG_reserved is set for special pages, which can never be swapped out. Some
- * of them might not even exist (eg empty_bad_page)...
+ * of them might not even exist...
  *
  * The PG_private bitflag is set on pagecache pages if they contain filesystem
  * specific data (which is normally at page->private). It can be used by
-- 
cgit v1.2.3


From 72b045aecdd856b083521f2a963705b4c2e59680 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 15 Nov 2017 17:34:33 -0800
Subject: mm: implement find_get_pages_range_tag()

Patch series "Ranged pagevec tagged lookup", v3.

In this series I provide a ranged variant of pagevec_lookup_tag() and
use it in places where it makes sense.  This series removes some common
code and it also has a potential for speeding up some operations
similarly as for pagevec_lookup_range() (but for now I can think of only
artificial cases where this happens).

This patch (of 16):

Implement a variant of find_get_pages_tag() that stops iterating at
given index.  Lots of users of this function (through pagevec_lookup())
actually want a range lookup and all of them are currently open-coding
this.

Also create corresponding pagevec_lookup_range_tag() function.

Link: http://lkml.kernel.org/r/20171009151359.31984-2-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: David Howells <dhowells@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Steve French <sfrench@samba.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 12 ++++++++++--
 include/linux/pagevec.h | 11 +++++++++--
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e08b5339023c..956d6a80e126 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -366,8 +366,16 @@ static inline unsigned find_get_pages(struct address_space *mapping,
 }
 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 			       unsigned int nr_pages, struct page **pages);
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
-			int tag, unsigned int nr_pages, struct page **pages);
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+			pgoff_t end, int tag, unsigned int nr_pages,
+			struct page **pages);
+static inline unsigned find_get_pages_tag(struct address_space *mapping,
+			pgoff_t *index, int tag, unsigned int nr_pages,
+			struct page **pages)
+{
+	return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
+					nr_pages, pages);
+}
 unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
 			int tag, unsigned int nr_entries,
 			struct page **entries, pgoff_t *indices);
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 2636c0c0f279..afc718f586f8 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -38,9 +38,16 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec,
 	return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
 }
 
-unsigned pagevec_lookup_tag(struct pagevec *pvec,
+unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
+		struct address_space *mapping, pgoff_t *index, pgoff_t end,
+		int tag, unsigned nr_pages);
+static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, int tag,
-		unsigned nr_pages);
+		unsigned nr_pages)
+{
+	return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag,
+					nr_pages);
+}
 
 static inline void pagevec_init(struct pagevec *pvec, int cold)
 {
-- 
cgit v1.2.3


From 93d3b7140ad379885849ad2674b4290c9e8273da Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 15 Nov 2017 17:35:12 -0800
Subject: mm: add variant of pagevec_lookup_range_tag() taking number of pages

Currently pagevec_lookup_range_tag() takes number of pages to look up
but most users don't need this.  Create a new function
pagevec_lookup_range_nr_tag() that takes maximum number of pages to
lookup for Ceph which wants this functionality so that we can drop
nr_pages argument from pagevec_lookup_range_tag().

Link: http://lkml.kernel.org/r/20171009151359.31984-13-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index afc718f586f8..87a2dfd62f5e 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -41,6 +41,9 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec,
 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, pgoff_t end,
 		int tag, unsigned nr_pages);
+unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
+		struct address_space *mapping, pgoff_t *index, pgoff_t end,
+		int tag, unsigned max_pages);
 static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, int tag,
 		unsigned nr_pages)
-- 
cgit v1.2.3


From 67fd707f468142d0f689a6240044bb45c1913003 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 15 Nov 2017 17:35:19 -0800
Subject: mm: remove nr_pages argument from pagevec_lookup_{,range}_tag()

All users of pagevec_lookup() and pagevec_lookup_range() now pass
PAGEVEC_SIZE as a desired number of pages.  Just drop the argument.

Link: http://lkml.kernel.org/r/20171009151359.31984-15-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 87a2dfd62f5e..4f56e0ad9d00 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -40,16 +40,14 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec,
 
 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, pgoff_t end,
-		int tag, unsigned nr_pages);
+		int tag);
 unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, pgoff_t end,
 		int tag, unsigned max_pages);
 static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
-		struct address_space *mapping, pgoff_t *index, int tag,
-		unsigned nr_pages)
+		struct address_space *mapping, pgoff_t *index, int tag)
 {
-	return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag,
-					nr_pages);
+	return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
 }
 
 static inline void pagevec_init(struct pagevec *pvec, int cold)
-- 
cgit v1.2.3


From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:33 -0800
Subject: mm: account pud page tables

On a machine with 5-level paging support a process can allocate
significant amount of memory and stay unnoticed by oom-killer and memory
cgroup.  The trick is to allocate a lot of PUD page tables.  We don't
account PUD page tables, only PMD and PTE.

We already addressed the same issue for PMD page tables, see commit
dc6c9a35b66b ("mm: account pmd page tables to the process").
Introduction of 5-level paging brings the same issue for PUD page
tables.

The patch expands accounting to PUD level.

[kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/]
  Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com
[heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting]
  Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h       | 36 +++++++++++++++++++++++++++++++++---
 include/linux/mm_types.h |  3 +++
 2 files changed, 36 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 91b46f99b4d2..9af86f39d928 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
 #endif
 
-#ifdef __PAGETABLE_PUD_FOLDED
+#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
 static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 						unsigned long address)
 {
 	return 0;
 }
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_nr_puds_init(struct mm_struct *mm) {}
+static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
+
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
+
+static inline void mm_nr_puds_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_puds, 0);
+}
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_puds);
+}
+
+static inline void mm_inc_nr_puds(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_puds);
+}
+
+static inline void mm_dec_nr_puds(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_puds);
+}
 #endif
 
 #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
@@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 
 static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
 	return 0;
 }
@@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm)
 	atomic_long_set(&mm->nr_pmds, 0);
 }
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
 	return atomic_long_read(&mm->nr_pmds);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d1b8e8f97fc2..e9e561e02d22 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -404,6 +404,9 @@ struct mm_struct {
 	atomic_long_t nr_ptes;			/* PTE page table pages */
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
+#endif
+#if CONFIG_PGTABLE_LEVELS > 3
+	atomic_long_t nr_puds;			/* PUD page table pages */
 #endif
 	int map_count;				/* number of VMAs */
 
-- 
cgit v1.2.3


From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:37 -0800
Subject: mm: introduce wrappers to access mm->nr_ptes

Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd
and nr_pud.

The patch also makes nr_ptes accounting dependent onto CONFIG_MMU.  Page
table accounting doesn't make sense if you don't have page tables.

It's preparation for consolidation of page-table counters in mm_struct.

Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h       | 32 ++++++++++++++++++++++++++++++++
 include/linux/mm_types.h |  2 ++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9af86f39d928..2ca799f0d762 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm)
 }
 #endif
 
+#ifdef CONFIG_MMU
+static inline void mm_nr_ptes_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_ptes, 0);
+}
+
+static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_ptes);
+}
+
+static inline void mm_inc_nr_ptes(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_ptes);
+}
+
+static inline void mm_dec_nr_ptes(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_ptes);
+}
+#else
+static inline void mm_nr_ptes_init(struct mm_struct *mm) {}
+
+static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
+static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
+#endif
+
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e9e561e02d22..e42048020664 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -401,7 +401,9 @@ struct mm_struct {
 	 */
 	atomic_t mm_count;
 
+#ifdef CONFIG_MMU
 	atomic_long_t nr_ptes;			/* PTE page table pages */
+#endif
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
 #endif
-- 
cgit v1.2.3


From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:40 -0800
Subject: mm: consolidate page table accounting

Currently, we account page tables separately for each page table level,
but that's redundant -- we only make use of total memory allocated to
page tables for oom_badness calculation.  We also provide the
information to userspace, but it has dubious value there too.

This patch switches page table accounting to single counter.

mm->pgtables_bytes is now used to account all page table levels.  We use
bytes, because page table size for different levels of page table tree
may be different.

The change has user-visible effect: we don't have VmPMD and VmPUD
reported in /proc/[pid]/status.  Not sure if anybody uses them.  (As
alternative, we can always report 0 kB for them.)

OOM-killer report is also slightly changed: we now report pgtables_bytes
instead of nr_ptes, nr_pmd, nr_puds.

Apart from reducing number of counters per-mm, the benefit is that we
now calculate oom_badness() more correctly for machines which have
different size of page tables depending on level or where page tables
are less than a page in size.

The only downside can be debuggability because we do not know which page
table level could leak.  But I do not remember many bugs that would be
caught by separate counters so I wouldn't lose sleep over this.

[akpm@linux-foundation.org: fix mm/huge_memory.c]
Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
[kirill.shutemov@linux.intel.com: fix build]
  Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h       | 58 ++++++++++--------------------------------------
 include/linux/mm_types.h |  8 +------
 2 files changed, 13 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2ca799f0d762..7c1e82a1aa77 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 {
 	return 0;
 }
-
-static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void mm_nr_puds_init(struct mm_struct *mm) {}
 static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
 static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
 
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
 
-static inline void mm_nr_puds_init(struct mm_struct *mm)
-{
-	atomic_long_set(&mm->nr_puds, 0);
-}
-
-static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
-{
-	return atomic_long_read(&mm->nr_puds);
-}
-
 static inline void mm_inc_nr_puds(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_puds);
+	atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_puds(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_puds);
+	atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
 }
 #endif
 
@@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 	return 0;
 }
 
-static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
-
-static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
-{
-	return 0;
-}
-
 static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
 static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
 
 #else
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
 
-static inline void mm_nr_pmds_init(struct mm_struct *mm)
-{
-	atomic_long_set(&mm->nr_pmds, 0);
-}
-
-static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
-{
-	return atomic_long_read(&mm->nr_pmds);
-}
-
 static inline void mm_inc_nr_pmds(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_pmds);
+	atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_pmds(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_pmds);
+	atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
 }
 #endif
 
 #ifdef CONFIG_MMU
-static inline void mm_nr_ptes_init(struct mm_struct *mm)
+static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
 {
-	atomic_long_set(&mm->nr_ptes, 0);
+	atomic_long_set(&mm->pgtables_bytes, 0);
 }
 
-static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
 {
-	return atomic_long_read(&mm->nr_ptes);
+	return atomic_long_read(&mm->pgtables_bytes);
 }
 
 static inline void mm_inc_nr_ptes(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_ptes);
+	atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_ptes(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_ptes);
+	atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
 }
 #else
-static inline void mm_nr_ptes_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
+static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
 {
 	return 0;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e42048020664..09643e0472fc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -402,13 +402,7 @@ struct mm_struct {
 	atomic_t mm_count;
 
 #ifdef CONFIG_MMU
-	atomic_long_t nr_ptes;			/* PTE page table pages */
-#endif
-#if CONFIG_PGTABLE_LEVELS > 2
-	atomic_long_t nr_pmds;			/* PMD page table pages */
-#endif
-#if CONFIG_PGTABLE_LEVELS > 3
-	atomic_long_t nr_puds;			/* PUD page table pages */
+	atomic_long_t pgtables_bytes;		/* PTE page table pages */
 #endif
 	int map_count;				/* number of VMAs */
 
-- 
cgit v1.2.3


From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:35:51 -0800
Subject: kmemcheck: remove annotations

Patch series "kmemcheck: kill kmemcheck", v2.

As discussed at LSF/MM, kill kmemcheck.

KASan is a replacement that is able to work without the limitation of
kmemcheck (single CPU, slow).  KASan is already upstream.

We are also not aware of any users of kmemcheck (or users who don't
consider KASan as a suitable replacement).

The only objection was that since KASAN wasn't supported by all GCC
versions provided by distros at that time we should hold off for 2
years, and try again.

Now that 2 years have passed, and all distros provide gcc that supports
KASAN, kill kmemcheck again for the very same reasons.

This patch (of 4):

Remove kmemcheck annotations, and calls to kmemcheck from the kernel.

[alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs]
  Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com
Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Hansen <devtimhansen@gmail.com>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/c2port.h           | 4 ----
 include/linux/dma-mapping.h      | 8 +-------
 include/linux/filter.h           | 2 --
 include/linux/mm_types.h         | 8 --------
 include/linux/net.h              | 3 ---
 include/linux/ring_buffer.h      | 3 ---
 include/linux/skbuff.h           | 3 ---
 include/net/inet_sock.h          | 3 ---
 include/net/inet_timewait_sock.h | 4 ----
 include/net/sock.h               | 3 ---
 10 files changed, 1 insertion(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/c2port.h b/include/linux/c2port.h
index 4efabcb51347..f2736348ca26 100644
--- a/include/linux/c2port.h
+++ b/include/linux/c2port.h
@@ -9,8 +9,6 @@
  * the Free Software Foundation
  */
 
-#include <linux/kmemcheck.h>
-
 #define C2PORT_NAME_LEN			32
 
 struct device;
@@ -22,10 +20,8 @@ struct device;
 /* Main struct */
 struct c2port_ops;
 struct c2port_device {
-	kmemcheck_bitfield_begin(flags);
 	unsigned int access:1;
 	unsigned int flash_access:1;
-	kmemcheck_bitfield_end(flags);
 
 	int id;
 	char name[C2PORT_NAME_LEN];
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index eee1499db396..e8f8e8fb244d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -9,7 +9,6 @@
 #include <linux/dma-debug.h>
 #include <linux/dma-direction.h>
 #include <linux/scatterlist.h>
-#include <linux/kmemcheck.h>
 #include <linux/bug.h>
 #include <linux/mem_encrypt.h>
 
@@ -232,7 +231,6 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
-	kmemcheck_mark_initialized(ptr, size);
 	BUG_ON(!valid_dma_direction(dir));
 	addr = ops->map_page(dev, virt_to_page(ptr),
 			     offset_in_page(ptr), size,
@@ -265,11 +263,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
 				   unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-	int i, ents;
-	struct scatterlist *s;
+	int ents;
 
-	for_each_sg(sg, s, nents, i)
-		kmemcheck_mark_initialized(sg_virt(s), s->length);
 	BUG_ON(!valid_dma_direction(dir));
 	ents = ops->map_sg(dev, sg, nents, dir, attrs);
 	BUG_ON(ents < 0);
@@ -299,7 +294,6 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
-	kmemcheck_mark_initialized(page_address(page) + offset, size);
 	BUG_ON(!valid_dma_direction(dir));
 	addr = ops->map_page(dev, page, offset, size, dir, attrs);
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 48ec57e70f9f..42197b16dd78 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -454,13 +454,11 @@ struct bpf_binary_header {
 
 struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
-	kmemcheck_bitfield_begin(meta);
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1;	/* Do we need dst entry? */
-	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 09643e0472fc..cfd0ac4e5e0e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -209,14 +209,6 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 
-#ifdef CONFIG_KMEMCHECK
-	/*
-	 * kmemcheck wants to track the status of each byte in a page; this
-	 * is a pointer to such a status block. NULL if not tracked.
-	 */
-	void *shadow;
-#endif
-
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 	int _last_cpupid;
 #endif
diff --git a/include/linux/net.h b/include/linux/net.h
index d97d80d7fdf8..caeb159abda5 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -22,7 +22,6 @@
 #include <linux/random.h>
 #include <linux/wait.h>
 #include <linux/fcntl.h>	/* For O_CLOEXEC and O_NONBLOCK */
-#include <linux/kmemcheck.h>
 #include <linux/rcupdate.h>
 #include <linux/once.h>
 #include <linux/fs.h>
@@ -111,9 +110,7 @@ struct socket_wq {
 struct socket {
 	socket_state		state;
 
-	kmemcheck_bitfield_begin(type);
 	short			type;
-	kmemcheck_bitfield_end(type);
 
 	unsigned long		flags;
 
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index fa6ace66fea5..289e4d54e3e0 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -2,7 +2,6 @@
 #ifndef _LINUX_RING_BUFFER_H
 #define _LINUX_RING_BUFFER_H
 
-#include <linux/kmemcheck.h>
 #include <linux/mm.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
@@ -14,9 +13,7 @@ struct ring_buffer_iter;
  * Don't refer to this struct directly, use functions below.
  */
 struct ring_buffer_event {
-	kmemcheck_bitfield_begin(bitfield);
 	u32		type_len:5, time_delta:27;
-	kmemcheck_bitfield_end(bitfield);
 
 	u32		array[];
 };
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d448a4804aea..aa1341474916 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -15,7 +15,6 @@
 #define _LINUX_SKBUFF_H
 
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/compiler.h>
 #include <linux/time.h>
 #include <linux/bug.h>
@@ -704,7 +703,6 @@ struct sk_buff {
 	/* Following fields are _not_ copied in __copy_skb_header()
 	 * Note that queue_mapping is here mostly to fill a hole.
 	 */
-	kmemcheck_bitfield_begin(flags1);
 	__u16			queue_mapping;
 
 /* if you move cloned around you also must adapt those constants */
@@ -723,7 +721,6 @@ struct sk_buff {
 				head_frag:1,
 				xmit_more:1,
 				__unused:1; /* one bit hole */
-	kmemcheck_bitfield_end(flags1);
 
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index db8162dd8c0b..8e51b4a69088 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -17,7 +17,6 @@
 #define _INET_SOCK_H
 
 #include <linux/bitops.h>
-#include <linux/kmemcheck.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/jhash.h>
@@ -84,7 +83,6 @@ struct inet_request_sock {
 #define ireq_state		req.__req_common.skc_state
 #define ireq_family		req.__req_common.skc_family
 
-	kmemcheck_bitfield_begin(flags);
 	u16			snd_wscale : 4,
 				rcv_wscale : 4,
 				tstamp_ok  : 1,
@@ -93,7 +91,6 @@ struct inet_request_sock {
 				ecn_ok	   : 1,
 				acked	   : 1,
 				no_srccheck: 1;
-	kmemcheck_bitfield_end(flags);
 	u32                     ir_mark;
 	union {
 		struct ip_options_rcu __rcu	*ireq_opt;
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 6a75d67a30fd..1356fa6a7566 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -15,8 +15,6 @@
 #ifndef _INET_TIMEWAIT_SOCK_
 #define _INET_TIMEWAIT_SOCK_
 
-
-#include <linux/kmemcheck.h>
 #include <linux/list.h>
 #include <linux/timer.h>
 #include <linux/types.h>
@@ -69,14 +67,12 @@ struct inet_timewait_sock {
 	/* Socket demultiplex comparisons on incoming packets. */
 	/* these three are in inet_sock */
 	__be16			tw_sport;
-	kmemcheck_bitfield_begin(flags);
 	/* And these are ours. */
 	unsigned int		tw_kill		: 1,
 				tw_transparent  : 1,
 				tw_flowlabel	: 20,
 				tw_pad		: 2,	/* 2 bits hole */
 				tw_tos		: 8;
-	kmemcheck_bitfield_end(flags);
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
 };
diff --git a/include/net/sock.h b/include/net/sock.h
index c577286dbffb..a63e6a8bb7e0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -436,7 +436,6 @@ struct sock {
 #define SK_FL_TYPE_MASK    0xffff0000
 #endif
 
-	kmemcheck_bitfield_begin(flags);
 	unsigned int		sk_padding : 1,
 				sk_kern_sock : 1,
 				sk_no_check_tx : 1,
@@ -445,8 +444,6 @@ struct sock {
 				sk_protocol  : 8,
 				sk_type      : 16;
 #define SK_PROTOCOL_MAX U8_MAX
-	kmemcheck_bitfield_end(flags);
-
 	u16			sk_gso_max_segs;
 	unsigned long	        sk_lingertime;
 	struct proto		*sk_prot_creator;
-- 
cgit v1.2.3


From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:35:54 -0800
Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK

Convert all allocations that used a NOTRACK flag to stop using it.

Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Hansen <devtimhansen@gmail.com>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/thread_info.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 4bcdf00c110f..34f053a150a9 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -44,10 +44,9 @@ enum {
 #endif
 
 #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
-				 __GFP_ZERO)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 #else
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT)
 #endif
 
 /*
-- 
cgit v1.2.3


From d8be75663cec0069b85f80191abd2682ce4a512f Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:35:58 -0800
Subject: kmemcheck: remove whats left of NOTRACK flags

Now that kmemcheck is gone, we don't need the NOTRACK flags.

Link: http://lkml.kernel.org/r/20171007030159.22241-5-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Hansen <devtimhansen@gmail.com>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h            | 9 ---------
 include/linux/slab.h           | 6 ------
 include/trace/events/mmflags.h | 1 -
 3 files changed, 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 710143741eb5..b041f94678de 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -37,7 +37,6 @@ struct vm_area_struct;
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_ATOMIC		0x80000u
 #define ___GFP_ACCOUNT		0x100000u
-#define ___GFP_NOTRACK		0x200000u
 #define ___GFP_DIRECT_RECLAIM	0x400000u
 #define ___GFP_WRITE		0x800000u
 #define ___GFP_KSWAPD_RECLAIM	0x1000000u
@@ -201,19 +200,11 @@ struct vm_area_struct;
  * __GFP_COMP address compound page metadata.
  *
  * __GFP_ZERO returns a zeroed page on success.
- *
- * __GFP_NOTRACK avoids tracking with kmemcheck.
- *
- * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of
- *   distinguishing in the source between false positives and allocations that
- *   cannot be supported (e.g. page tables).
  */
 #define __GFP_COLD	((__force gfp_t)___GFP_COLD)
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
-#define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)
-#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
 /* Disable lockdep for GFP context tracking */
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ebddfca9e24b..79f6532f8a0b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -89,12 +89,6 @@
 /* Avoid kmemleak tracing */
 #define SLAB_NOLEAKTRACE	((slab_flags_t __force)0x00800000U)
 
-/* Don't track use of uninitialized memory */
-#ifdef CONFIG_KMEMCHECK
-# define SLAB_NOTRACK		((slab_flags_t __force)0x01000000U)
-#else
-# define SLAB_NOTRACK		0
-#endif
 /* Fault injection mark */
 #ifdef CONFIG_FAILSLAB
 # define SLAB_FAILSLAB		((slab_flags_t __force)0x02000000U)
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 648cbf603736..72162f3a03fa 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -46,7 +46,6 @@
 	{(unsigned long)__GFP_RECLAIMABLE,	"__GFP_RECLAIMABLE"},	\
 	{(unsigned long)__GFP_MOVABLE,		"__GFP_MOVABLE"},	\
 	{(unsigned long)__GFP_ACCOUNT,		"__GFP_ACCOUNT"},	\
-	{(unsigned long)__GFP_NOTRACK,		"__GFP_NOTRACK"},	\
 	{(unsigned long)__GFP_WRITE,		"__GFP_WRITE"},		\
 	{(unsigned long)__GFP_RECLAIM,		"__GFP_RECLAIM"},	\
 	{(unsigned long)__GFP_DIRECT_RECLAIM,	"__GFP_DIRECT_RECLAIM"},\
-- 
cgit v1.2.3


From 4675ff05de2d76d167336b368bd07f3fef6ed5a6 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:36:02 -0800
Subject: kmemcheck: rip it out

Fix up makefiles, remove references, and git rm kmemcheck.

Link: http://lkml.kernel.org/r/20171007030159.22241-4-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Tim Hansen <devtimhansen@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/interrupt.h |  15 ----
 include/linux/kmemcheck.h | 171 ----------------------------------------------
 2 files changed, 186 deletions(-)

(limited to 'include')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index baeb872283d9..69c238210325 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -594,21 +594,6 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t)
 		__tasklet_hi_schedule(t);
 }
 
-extern void __tasklet_hi_schedule_first(struct tasklet_struct *t);
-
-/*
- * This version avoids touching any other tasklets. Needed for kmemcheck
- * in order not to take any page faults while enqueueing this tasklet;
- * consider VERY carefully whether you really need this or
- * tasklet_hi_schedule()...
- */
-static inline void tasklet_hi_schedule_first(struct tasklet_struct *t)
-{
-	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
-		__tasklet_hi_schedule_first(t);
-}
-
-
 static inline void tasklet_disable_nosync(struct tasklet_struct *t)
 {
 	atomic_inc(&t->count);
diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
index 7b1d7bead7d9..ea32a7d3cf1b 100644
--- a/include/linux/kmemcheck.h
+++ b/include/linux/kmemcheck.h
@@ -1,172 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_KMEMCHECK_H
-#define LINUX_KMEMCHECK_H
-
-#include <linux/mm_types.h>
-#include <linux/types.h>
-
-#ifdef CONFIG_KMEMCHECK
-extern int kmemcheck_enabled;
-
-/* The slab-related functions. */
-void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node);
-void kmemcheck_free_shadow(struct page *page, int order);
-void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
-			  size_t size);
-void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size);
-
-void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order,
-			       gfp_t gfpflags);
-
-void kmemcheck_show_pages(struct page *p, unsigned int n);
-void kmemcheck_hide_pages(struct page *p, unsigned int n);
-
-bool kmemcheck_page_is_tracked(struct page *p);
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n);
-void kmemcheck_mark_uninitialized(void *address, unsigned int n);
-void kmemcheck_mark_initialized(void *address, unsigned int n);
-void kmemcheck_mark_freed(void *address, unsigned int n);
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n);
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n);
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n);
-
-int kmemcheck_show_addr(unsigned long address);
-int kmemcheck_hide_addr(unsigned long address);
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size);
-
-/*
- * Bitfield annotations
- *
- * How to use: If you have a struct using bitfields, for example
- *
- *     struct a {
- *             int x:8, y:8;
- *     };
- *
- * then this should be rewritten as
- *
- *     struct a {
- *             kmemcheck_bitfield_begin(flags);
- *             int x:8, y:8;
- *             kmemcheck_bitfield_end(flags);
- *     };
- *
- * Now the "flags_begin" and "flags_end" members may be used to refer to the
- * beginning and end, respectively, of the bitfield (and things like
- * &x.flags_begin is allowed). As soon as the struct is allocated, the bit-
- * fields should be annotated:
- *
- *     struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL);
- *     kmemcheck_annotate_bitfield(a, flags);
- */
-#define kmemcheck_bitfield_begin(name)	\
-	int name##_begin[0];
-
-#define kmemcheck_bitfield_end(name)	\
-	int name##_end[0];
-
-#define kmemcheck_annotate_bitfield(ptr, name)				\
-	do {								\
-		int _n;							\
-									\
-		if (!ptr)						\
-			break;						\
-									\
-		_n = (long) &((ptr)->name##_end)			\
-			- (long) &((ptr)->name##_begin);		\
-		BUILD_BUG_ON(_n < 0);					\
-									\
-		kmemcheck_mark_initialized(&((ptr)->name##_begin), _n);	\
-	} while (0)
-
-#define kmemcheck_annotate_variable(var)				\
-	do {								\
-		kmemcheck_mark_initialized(&(var), sizeof(var));	\
-	} while (0)							\
-
-#else
-#define kmemcheck_enabled 0
-
-static inline void
-kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
-{
-}
-
-static inline void
-kmemcheck_free_shadow(struct page *page, int order)
-{
-}
-
-static inline void
-kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
-		     size_t size)
-{
-}
-
-static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object,
-				       size_t size)
-{
-}
-
-static inline void kmemcheck_pagealloc_alloc(struct page *p,
-	unsigned int order, gfp_t gfpflags)
-{
-}
-
-static inline bool kmemcheck_page_is_tracked(struct page *p)
-{
-	return false;
-}
-
-static inline void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_freed(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_unallocated_pages(struct page *p,
-						    unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_uninitialized_pages(struct page *p,
-						      unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_initialized_pages(struct page *p,
-						    unsigned int n)
-{
-}
-
-static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
-	return true;
-}
-
-#define kmemcheck_bitfield_begin(name)
-#define kmemcheck_bitfield_end(name)
-#define kmemcheck_annotate_bitfield(ptr, name)	\
-	do {					\
-	} while (0)
-
-#define kmemcheck_annotate_variable(var)	\
-	do {					\
-	} while (0)
-
-#endif /* CONFIG_KMEMCHECK */
-
-#endif /* LINUX_KMEMCHECK_H */
-- 
cgit v1.2.3


From ea1f5f3712afe895dfa4176ec87376b4a9ac23be Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Wed, 15 Nov 2017 17:36:27 -0800
Subject: mm: define memblock_virt_alloc_try_nid_raw

* A new variant of memblock_virt_alloc_* allocations:
memblock_virt_alloc_try_nid_raw()
    - Does not zero the allocated memory
    - Does not panic if request cannot be satisfied

* optimize early system hash allocations

Clients can call alloc_large_system_hash() with flag: HASH_ZERO to
specify that memory that was allocated for system hash needs to be
zeroed, otherwise the memory does not need to be zeroed, and client will
initialize it.

If memory does not need to be zero'd, call the new
memblock_virt_alloc_raw() interface, and thus improve the boot
performance.

* debug for raw alloctor

When CONFIG_DEBUG_VM is enabled, this patch sets all the memory that is
returned by memblock_virt_alloc_try_nid_raw() to ones to ensure that no
places excpect zeroed memory.

Link: http://lkml.kernel.org/r/20171013173214.27300-6-pasha.tatashin@oracle.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index fdf40ca04b3c..a53063e9d7d8 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -161,6 +161,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 #define BOOTMEM_ALLOC_ANYWHERE		(~(phys_addr_t)0)
 
 /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
+				      phys_addr_t min_addr,
+				      phys_addr_t max_addr, int nid);
 void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
 		phys_addr_t align, phys_addr_t min_addr,
 		phys_addr_t max_addr, int nid);
@@ -177,6 +180,14 @@ static inline void * __init memblock_virt_alloc(
 					    NUMA_NO_NODE);
 }
 
+static inline void * __init memblock_virt_alloc_raw(
+					phys_addr_t size,  phys_addr_t align)
+{
+	return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT,
+					    BOOTMEM_ALLOC_ACCESSIBLE,
+					    NUMA_NO_NODE);
+}
+
 static inline void * __init memblock_virt_alloc_nopanic(
 					phys_addr_t size, phys_addr_t align)
 {
@@ -258,6 +269,14 @@ static inline void * __init memblock_virt_alloc(
 	return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT);
 }
 
+static inline void * __init memblock_virt_alloc_raw(
+					phys_addr_t size,  phys_addr_t align)
+{
+	if (!align)
+		align = SMP_CACHE_BYTES;
+	return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT);
+}
+
 static inline void * __init memblock_virt_alloc_nopanic(
 					phys_addr_t size, phys_addr_t align)
 {
@@ -310,6 +329,14 @@ static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size,
 					  min_addr);
 }
 
+static inline void * __init memblock_virt_alloc_try_nid_raw(
+			phys_addr_t size, phys_addr_t align,
+			phys_addr_t min_addr, phys_addr_t max_addr, int nid)
+{
+	return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align,
+				min_addr, max_addr);
+}
+
 static inline void * __init memblock_virt_alloc_try_nid_nopanic(
 			phys_addr_t size, phys_addr_t align,
 			phys_addr_t min_addr, phys_addr_t max_addr, int nid)
-- 
cgit v1.2.3


From a4a3ede2132ae0863e2d43e06f9b5697c51a7a3b Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Wed, 15 Nov 2017 17:36:31 -0800
Subject: mm: zero reserved and unavailable struct pages

Some memory is reserved but unavailable: not present in memblock.memory
(because not backed by physical pages), but present in memblock.reserved.
Such memory has backing struct pages, but they are not initialized by
going through __init_single_page().

In some cases these struct pages are accessed even if they do not
contain any data.  One example is page_to_pfn() might access page->flags
if this is where section information is stored (CONFIG_SPARSEMEM,
SECTION_IN_PAGE_FLAGS).

One example of such memory: trim_low_memory_range() unconditionally
reserves from pfn 0, but e820__memblock_setup() might provide the
exiting memory from pfn 1 (i.e.  KVM).

Since struct pages are zeroed in __init_single_page(), and not during
allocation time, we must zero such struct pages explicitly.

The patch involves adding a new memblock iterator:
	for_each_resv_unavail_range(i, p_start, p_end)

Which iterates through reserved && !memory lists, and we zero struct pages
explicitly by calling mm_zero_struct_page().

===

Here is more detailed example of problem that this patch is addressing:

Run tested on qemu with the following arguments:

	-enable-kvm -cpu kvm64 -m 512 -smp 2

This patch reports that there are 98 unavailable pages.

They are: pfn 0 and pfns in range [159, 255].

Note, trim_low_memory_range() reserves only pfns in range [0, 15], it does
not reserve [159, 255] ones.

e820__memblock_setup() reports linux that the following physical ranges are
available:
    [1 , 158]
[256, 130783]

Notice, that exactly unavailable pfns are missing!

Now, lets check what we have in zone 0: [1, 131039]

pfn 0, is not part of the zone, but pfns [1, 158], are.

However, the bigger problem we have if we do not initialize these struct
pages is with memory hotplug.  Because, that path operates at 2M
boundaries (section_nr).  And checks if 2M range of pages is hot
removable.  It starts with first pfn from zone, rounds it down to 2M
boundary (sturct pages are allocated at 2M boundaries when vmemmap is
created), and checks if that section is hot removable.  In this case
start with pfn 1 and convert it down to pfn 0.  Later pfn is converted
to struct page, and some fields are checked.  Now, if we do not zero
struct pages, we get unpredictable results.

In fact when CONFIG_VM_DEBUG is enabled, and we explicitly set all
vmemmap memory to ones, the following panic is observed with kernel test
without this patch applied:

  BUG: unable to handle kernel NULL pointer dereference at          (null)
  IP: is_pageblock_removable_nolock+0x35/0x90
  PGD 0 P4D 0
  Oops: 0000 [#1] PREEMPT
  ...
  task: ffff88001f4e2900 task.stack: ffffc90000314000
  RIP: 0010:is_pageblock_removable_nolock+0x35/0x90
  Call Trace:
   ? is_mem_section_removable+0x5a/0xd0
   show_mem_removable+0x6b/0xa0
   dev_attr_show+0x1b/0x50
   sysfs_kf_seq_show+0xa1/0x100
   kernfs_seq_show+0x22/0x30
   seq_read+0x1ac/0x3a0
   kernfs_fop_read+0x36/0x190
   ? security_file_permission+0x90/0xb0
   __vfs_read+0x16/0x30
   vfs_read+0x81/0x130
   SyS_read+0x44/0xa0
   entry_SYSCALL_64_fastpath+0x1f/0xbd

Link: http://lkml.kernel.org/r/20171013173214.27300-7-pasha.tatashin@oracle.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 16 ++++++++++++++++
 include/linux/mm.h       | 15 +++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ce0e5634c2f9..7ed0f7782d16 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 	for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved,	\
 			       nid, flags, p_start, p_end, p_nid)
 
+/**
+ * for_each_resv_unavail_range - iterate through reserved and unavailable memory
+ * @i: u64 used as loop variable
+ * @flags: pick from blocks based on memory attributes
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over unavailable but reserved (reserved && !memory) areas of memblock.
+ * Available as soon as memblock is initialized.
+ * Note: because this memory does not belong to any physical node, flags and
+ * nid arguments do not make sense and thus not exported as arguments.
+ */
+#define for_each_resv_unavail_range(i, p_start, p_end)			\
+	for_each_mem_range(i, &memblock.reserved, &memblock.memory,	\
+			   NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
+
 static inline void memblock_set_region_flags(struct memblock_region *r,
 					     unsigned long flags)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7c1e82a1aa77..703599fa8288 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -95,6 +95,15 @@ extern int mmap_rnd_compat_bits __read_mostly;
 #define mm_forbids_zeropage(X)	(0)
 #endif
 
+/*
+ * On some architectures it is expensive to call memset() for small sizes.
+ * Those architectures should provide their own implementation of "struct page"
+ * zeroing by defining this macro in <asm/pgtable.h>.
+ */
+#ifndef mm_zero_struct_page
+#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
+#endif
+
 /*
  * Default maximum number of active map areas, this limits the number of vmas
  * per mm struct. Users can overwrite this number by sysctl but there is a
@@ -2030,6 +2039,12 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn,
 					struct mminit_pfnnid_cache *state);
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK
+void zero_resv_unavail(void);
+#else
+static inline void zero_resv_unavail(void) {}
+#endif
+
 extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_zone(unsigned long, int, unsigned long,
 				unsigned long, enum memmap_context);
-- 
cgit v1.2.3


From 736304f3245f39392895ff3392e1325d3e49e7d2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 15 Nov 2017 17:37:11 -0800
Subject: mm: speed up cancel_dirty_page() for clean pages

Patch series "Speed up page cache truncation", v1.

When rebasing our enterprise distro to a newer kernel (from 4.4 to 4.12)
we have noticed a regression in bonnie++ benchmark when deleting files.
Eventually we have tracked this down to a fact that page cache
truncation got slower by about 10%.  There were both gains and losses in
the above interval of kernels but we have been able to identify that
commit 83929372f629 ("filemap: prepare find and delete operations for
huge pages") caused about 10% regression on its own.

After some investigation it didn't seem easily possible to fix the
regression while maintaining the THP in page cache functionality so
we've decided to optimize the page cache truncation path instead to make
up for the change.  This series is a result of that effort.

Patch 1 is an easy speedup of cancel_dirty_page().  Patches 2-6 refactor
page cache truncation code so that it is easier to batch radix tree
operations.  Patch 7 implements batching of deletes from the radix tree
which more than makes up for the original regression.

This patch (of 7):

cancel_dirty_page() does quite some work even for clean pages (fetching
of mapping, locking of memcg, atomic bit op on page flags) so it
accounts for ~2.5% of cost of truncation of a clean page.  That is not
much but still dumb for something we don't need at all.  Check whether a
page is actually dirty and avoid any work if not.

Link: http://lkml.kernel.org/r/20171010151937.26984-2-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 703599fa8288..c7b1d617dff6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1440,7 +1440,13 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 			  struct bdi_writeback *wb);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
-void cancel_dirty_page(struct page *page);
+void __cancel_dirty_page(struct page *page);
+static inline void cancel_dirty_page(struct page *page)
+{
+	/* Avoid atomic ops, locking, etc. when not actually needed. */
+	if (PageDirty(page))
+		__cancel_dirty_page(page);
+}
 int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
-- 
cgit v1.2.3


From aa65c29ce1b6e1990cd2c7d8004bbea7ff3aff38 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 15 Nov 2017 17:37:33 -0800
Subject: mm: batch radix tree operations when truncating pages

Currently we remove pages from the radix tree one by one.  To speed up
page cache truncation, lock several pages at once and free them in one
go.  This allows us to batch radix tree operations in a more efficient
way and also save round-trips on mapping->tree_lock.  As a result we
gain about 20% speed improvement in page cache truncation.

Data from a simple benchmark timing 10000 truncates of 1024 pages (on
ext4 on ramdisk but the filesystem is barely visible in the profiles).
The range shows 1% and 95% percentiles of the measured times:

  4.14-rc2	4.14-rc2 + batched truncation
  248-256	209-219
  249-258	209-217
  248-255	211-239
  248-255	209-217
  247-256	210-218

[jack@suse.cz: convert delete_from_page_cache_batch() to pagevec]
  Link: http://lkml.kernel.org/r/20171018111648.13714-1-jack@suse.cz
[akpm@linux-foundation.org: move struct pagevec forward declaration to top-of-file]
Link: http://lkml.kernel.org/r/20171010151937.26984-8-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 956d6a80e126..e0f7181118fe 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -16,6 +16,8 @@
 #include <linux/hardirq.h> /* for in_interrupt() */
 #include <linux/hugetlb_inline.h>
 
+struct pagevec;
+
 /*
  * Bits in mapping->flags.
  */
@@ -624,6 +626,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 extern void delete_from_page_cache(struct page *page);
 extern void __delete_from_page_cache(struct page *page, void *shadow);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
+void delete_from_page_cache_batch(struct address_space *mapping,
+				  struct pagevec *pvec);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
-- 
cgit v1.2.3


From c7df8ad2910e965a6241b6d8f52fd122e26b0315 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:37:41 -0800
Subject: mm, truncate: do not check mapping for every page being truncated

During truncation, the mapping has already been checked for shmem and
dax so it's known that workingset_update_node is required.

This patch avoids the checks on mapping for each page being truncated.
In all other cases, a lookup helper is used to determine if
workingset_update_node() needs to be called.  The one danger is that the
API is slightly harder to use as calling workingset_update_node directly
without checking for dax or shmem mappings could lead to surprises.
However, the API rarely needs to be used and hopefully the comment is
enough to give people the hint.

sparsetruncate (tiny)
                              4.14.0-rc4             4.14.0-rc4
                             oneirq-v1r1        pickhelper-v1r1
Min          Time      141.00 (   0.00%)      140.00 (   0.71%)
1st-qrtle    Time      142.00 (   0.00%)      141.00 (   0.70%)
2nd-qrtle    Time      142.00 (   0.00%)      142.00 (   0.00%)
3rd-qrtle    Time      143.00 (   0.00%)      143.00 (   0.00%)
Max-90%      Time      144.00 (   0.00%)      144.00 (   0.00%)
Max-95%      Time      147.00 (   0.00%)      145.00 (   1.36%)
Max-99%      Time      195.00 (   0.00%)      191.00 (   2.05%)
Max          Time      230.00 (   0.00%)      205.00 (  10.87%)
Amean        Time      144.37 (   0.00%)      143.82 (   0.38%)
Stddev       Time       10.44 (   0.00%)        9.00 (  13.74%)
Coeff        Time        7.23 (   0.00%)        6.26 (  13.41%)
Best99%Amean Time      143.72 (   0.00%)      143.34 (   0.26%)
Best95%Amean Time      142.37 (   0.00%)      142.00 (   0.26%)
Best90%Amean Time      142.19 (   0.00%)      141.85 (   0.24%)
Best75%Amean Time      141.92 (   0.00%)      141.58 (   0.24%)
Best50%Amean Time      141.69 (   0.00%)      141.31 (   0.27%)
Best25%Amean Time      141.38 (   0.00%)      140.97 (   0.29%)

As you'd expect, the gain is marginal but it can be detected.  The
differences in bonnie are all within the noise which is not surprising
given the impact on the microbenchmark.

radix_tree_update_node_t is a callback for some radix operations that
optionally passes in a private field.  The only user of the callback is
workingset_update_node and as it no longer requires a mapping, the
private field is removed.

Link: http://lkml.kernel.org/r/20171018075952.10627-3-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h |  7 +++----
 include/linux/swap.h       | 13 ++++++++++++-
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 567ebb5eaab0..0ca448c1cb42 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -301,18 +301,17 @@ void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
 void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
 void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
 					unsigned long index);
-typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
+typedef void (*radix_tree_update_node_t)(struct radix_tree_node *);
 void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
 			  void __rcu **slot, void *entry,
-			  radix_tree_update_node_t update_node, void *private);
+			  radix_tree_update_node_t update_node);
 void radix_tree_iter_replace(struct radix_tree_root *,
 		const struct radix_tree_iter *, void __rcu **slot, void *entry);
 void radix_tree_replace_slot(struct radix_tree_root *,
 			     void __rcu **slot, void *entry);
 void __radix_tree_delete_node(struct radix_tree_root *,
 			      struct radix_tree_node *,
-			      radix_tree_update_node_t update_node,
-			      void *private);
+			      radix_tree_update_node_t update_node);
 void radix_tree_iter_delete(struct radix_tree_root *,
 			struct radix_tree_iter *iter, void __rcu **slot);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8b8a6f965785..454f042bcdd5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -298,7 +298,18 @@ struct vma_swap_readahead {
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
-void workingset_update_node(struct radix_tree_node *node, void *private);
+
+/* Do not use directly, use workingset_lookup_update */
+void workingset_update_node(struct radix_tree_node *node);
+
+/* Returns workingset_update_node() if the mapping has shadow entries. */
+#define workingset_lookup_update(mapping)				\
+({									\
+	radix_tree_update_node_t __helper = workingset_update_node;	\
+	if (dax_mapping(mapping) || shmem_mapping(mapping))		\
+		__helper = NULL;					\
+	__helper;							\
+})
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
-- 
cgit v1.2.3


From d9ed0d08b6c6a882da1d8e75bb3162fc889fd199 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:37:48 -0800
Subject: mm: only drain per-cpu pagevecs once per pagevec usage

When a pagevec is initialised on the stack, it is generally used
multiple times over a range of pages, looking up entries and then
releasing them.  On each pagevec_release, the per-cpu deferred LRU
pagevecs are drained on the grounds the page being released may be on
those queues and the pages may be cache hot.  In many cases only the
first drain is necessary as it's unlikely that the range of pages being
walked is racing against LRU addition.  Even if there is such a race,
the impact is marginal where as constantly redraining the lru pagevecs
costs.

This patch ensures that pagevec is only drained once in a given
lifecycle without increasing the cache footprint of the pagevec
structure.  Only sparsetruncate tiny is shown here as large files have
many exceptional entries and calls pagecache_release less frequently.

sparsetruncate (tiny)
                              4.14.0-rc4             4.14.0-rc4
                        batchshadow-v1r1          onedrain-v1r1
Min          Time      141.00 (   0.00%)      141.00 (   0.00%)
1st-qrtle    Time      142.00 (   0.00%)      142.00 (   0.00%)
2nd-qrtle    Time      142.00 (   0.00%)      142.00 (   0.00%)
3rd-qrtle    Time      143.00 (   0.00%)      143.00 (   0.00%)
Max-90%      Time      144.00 (   0.00%)      144.00 (   0.00%)
Max-95%      Time      146.00 (   0.00%)      145.00 (   0.68%)
Max-99%      Time      198.00 (   0.00%)      194.00 (   2.02%)
Max          Time      254.00 (   0.00%)      208.00 (  18.11%)
Amean        Time      145.12 (   0.00%)      144.30 (   0.56%)
Stddev       Time       12.74 (   0.00%)        9.62 (  24.49%)
Coeff        Time        8.78 (   0.00%)        6.67 (  24.06%)
Best99%Amean Time      144.29 (   0.00%)      143.82 (   0.32%)
Best95%Amean Time      142.68 (   0.00%)      142.31 (   0.26%)
Best90%Amean Time      142.52 (   0.00%)      142.19 (   0.24%)
Best75%Amean Time      142.26 (   0.00%)      141.98 (   0.20%)
Best50%Amean Time      141.90 (   0.00%)      141.71 (   0.13%)
Best25%Amean Time      141.80 (   0.00%)      141.43 (   0.26%)

The impact on bonnie is marginal and within the noise because a
significant percentage of the file being truncated has been reclaimed
and consists of shadow entries which reduce the hotness of the
pagevec_release path.

Link: http://lkml.kernel.org/r/20171018075952.10627-5-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 4f56e0ad9d00..95c75c858d1f 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -17,7 +17,8 @@ struct address_space;
 
 struct pagevec {
 	unsigned long nr;
-	unsigned long cold;
+	bool cold;
+	bool drained;
 	struct page *pages[PAGEVEC_SIZE];
 };
 
@@ -54,6 +55,7 @@ static inline void pagevec_init(struct pagevec *pvec, int cold)
 {
 	pvec->nr = 0;
 	pvec->cold = cold;
+	pvec->drained = false;
 }
 
 static inline void pagevec_reinit(struct pagevec *pvec)
-- 
cgit v1.2.3


From 8667982014d6048e0b5e286b6247ff24f48d4cc6 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:37:52 -0800
Subject: mm, pagevec: remove cold parameter for pagevecs

Every pagevec_init user claims the pages being released are hot even in
cases where it is unlikely the pages are hot.  As no one cares about the
hotness of pages being released to the allocator, just ditch the
parameter.

No performance impact is expected as the overhead is marginal.  The
parameter is removed simply because it is a bit stupid to have a useless
parameter copied everywhere.

Link: http://lkml.kernel.org/r/20171018075952.10627-6-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 95c75c858d1f..eebefd209424 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -17,7 +17,6 @@ struct address_space;
 
 struct pagevec {
 	unsigned long nr;
-	bool cold;
 	bool drained;
 	struct page *pages[PAGEVEC_SIZE];
 };
@@ -51,10 +50,9 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
 	return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
 }
 
-static inline void pagevec_init(struct pagevec *pvec, int cold)
+static inline void pagevec_init(struct pagevec *pvec)
 {
 	pvec->nr = 0;
-	pvec->cold = cold;
 	pvec->drained = false;
 }
 
-- 
cgit v1.2.3


From c6f92f9fbe7dbcc8903a67229aa88b4077ae4422 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:37:55 -0800
Subject: mm: remove cold parameter for release_pages

All callers of release_pages claim the pages being released are cache
hot.  As no one cares about the hotness of pages being released to the
allocator, just ditch the parameter.

No performance impact is expected as the overhead is marginal.  The
parameter is removed simply because it is a bit stupid to have a useless
parameter copied everywhere.

Link: http://lkml.kernel.org/r/20171018075952.10627-7-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 2 +-
 include/linux/swap.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e0f7181118fe..4c6790bb7afb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -118,7 +118,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 	m->gfp_mask = mask;
 }
 
-void release_pages(struct page **pages, int nr, bool cold);
+void release_pages(struct page **pages, int nr);
 
 /*
  * speculatively take a reference to a page.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 454f042bcdd5..c2b8128799c1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -510,7 +510,7 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
 #define free_page_and_swap_cache(page) \
 	put_page(page)
 #define free_pages_and_swap_cache(pages, nr) \
-	release_pages((pages), (nr), false);
+	release_pages((pages), (nr));
 
 static inline void show_swap_cache_info(void)
 {
-- 
cgit v1.2.3


From 2d4894b5d2ae0fe1725ea7abd57b33bfbbe45492 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:37:59 -0800
Subject: mm: remove cold parameter from free_hot_cold_page*

Most callers users of free_hot_cold_page claim the pages being released
are cache hot.  The exception is the page reclaim paths where it is
likely that enough pages will be freed in the near future that the
per-cpu lists are going to be recycled and the cache hotness information
is lost.  As no one really cares about the hotness of pages being
released to the allocator, just ditch the parameter.

The APIs are renamed to indicate that it's no longer about hot/cold
pages.  It should also be less confusing as there are subtle differences
between them.  __free_pages drops a reference and frees a page when the
refcount reaches zero.  free_hot_cold_page handled pages whose refcount
was already zero which is non-obvious from the name.  free_unref_page
should be more obvious.

No performance impact is expected as the overhead is marginal.  The
parameter is removed simply because it is a bit stupid to have a useless
parameter copied everywhere.

[mgorman@techsingularity.net: add pages to head, not tail]
  Link: http://lkml.kernel.org/r/20171019154321.qtpzaeftoyyw4iey@techsingularity.net
Link: http://lkml.kernel.org/r/20171018075952.10627-8-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h         |  4 ++--
 include/trace/events/kmem.h | 11 ++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b041f94678de..f7e62d9096fe 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -530,8 +530,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
-extern void free_hot_cold_page(struct page *page, bool cold);
-extern void free_hot_cold_page_list(struct list_head *list, bool cold);
+extern void free_unref_page(struct page *page);
+extern void free_unref_page_list(struct list_head *list);
 
 struct page_frag_cache;
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 285feeadac39..eb57e3037deb 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -172,24 +172,21 @@ TRACE_EVENT(mm_page_free,
 
 TRACE_EVENT(mm_page_free_batched,
 
-	TP_PROTO(struct page *page, int cold),
+	TP_PROTO(struct page *page),
 
-	TP_ARGS(page, cold),
+	TP_ARGS(page),
 
 	TP_STRUCT__entry(
 		__field(	unsigned long,	pfn		)
-		__field(	int,		cold		)
 	),
 
 	TP_fast_assign(
 		__entry->pfn		= page_to_pfn(page);
-		__entry->cold		= cold;
 	),
 
-	TP_printk("page=%p pfn=%lu order=0 cold=%d",
+	TP_printk("page=%p pfn=%lu order=0",
 			pfn_to_page(__entry->pfn),
-			__entry->pfn,
-			__entry->cold)
+			__entry->pfn)
 );
 
 TRACE_EVENT(mm_page_alloc,
-- 
cgit v1.2.3


From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:38:03 -0800
Subject: mm: remove __GFP_COLD

As the page free path makes no distinction between cache hot and cold
pages, there is no real useful ordering of pages in the free list that
allocation requests can take advantage of.  Juding from the users of
__GFP_COLD, it is likely that a number of them are the result of copying
other sites instead of actually measuring the impact.  Remove the
__GFP_COLD parameter which simplifies a number of paths in the page
allocator.

This is potentially controversial but bear in mind that the size of the
per-cpu pagelists versus modern cache sizes means that the whole per-cpu
list can often fit in the L3 cache.  Hence, there is only a potential
benefit for microbenchmarks that alloc/free pages in a tight loop.  It's
even worse when THP is taken into account which has little or no chance
of getting a cache-hot page as the per-cpu list is bypassed and the
zeroing of multiple pages will thrash the cache anyway.

The truncate microbenchmarks are not shown as this patch affects the
allocation path and not the free path.  A page fault microbenchmark was
tested but it showed no sigificant difference which is not surprising
given that the __GFP_COLD branches are a miniscule percentage of the
fault path.

Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h            | 5 -----
 include/linux/pagemap.h        | 8 +-------
 include/linux/skbuff.h         | 2 +-
 include/linux/slab.h           | 3 ---
 include/trace/events/mmflags.h | 1 -
 5 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f7e62d9096fe..1a4582b44d32 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -24,7 +24,6 @@ struct vm_area_struct;
 #define ___GFP_HIGH		0x20u
 #define ___GFP_IO		0x40u
 #define ___GFP_FS		0x80u
-#define ___GFP_COLD		0x100u
 #define ___GFP_NOWARN		0x200u
 #define ___GFP_RETRY_MAYFAIL	0x400u
 #define ___GFP_NOFAIL		0x800u
@@ -192,16 +191,12 @@ struct vm_area_struct;
 /*
  * Action modifiers
  *
- * __GFP_COLD indicates that the caller does not expect to be used in the near
- *   future. Where possible, a cache-cold page will be returned.
- *
  * __GFP_NOWARN suppresses allocation failure reports.
  *
  * __GFP_COMP address compound page metadata.
  *
  * __GFP_ZERO returns a zeroed page on success.
  */
-#define __GFP_COLD	((__force gfp_t)___GFP_COLD)
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4c6790bb7afb..34ce3ebf97d5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -234,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x)
 	return __page_cache_alloc(mapping_gfp_mask(x));
 }
 
-static inline struct page *page_cache_alloc_cold(struct address_space *x)
-{
-	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
-}
-
 static inline gfp_t readahead_gfp_mask(struct address_space *x)
 {
-	return mapping_gfp_mask(x) |
-				  __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
+	return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
 }
 
 typedef int filler_t(void *, struct page *);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index aa1341474916..7c46fd0b8b64 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2672,7 +2672,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
 	 * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
 	 *     code in gfp_to_alloc_flags that should be enforcing this.
 	 */
-	gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC;
+	gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
 
 	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
 }
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 79f6532f8a0b..50697a1d6621 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -467,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  * Also it is possible to set different flags by OR'ing
  * in one or more of the following additional @flags:
  *
- * %__GFP_COLD - Request cache-cold pages instead of
- *   trying to return cache-warm pages.
- *
  * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
  *
  * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 72162f3a03fa..dbe1bb058c09 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -32,7 +32,6 @@
 	{(unsigned long)__GFP_ATOMIC,		"__GFP_ATOMIC"},	\
 	{(unsigned long)__GFP_IO,		"__GFP_IO"},		\
 	{(unsigned long)__GFP_FS,		"__GFP_FS"},		\
-	{(unsigned long)__GFP_COLD,		"__GFP_COLD"},		\
 	{(unsigned long)__GFP_NOWARN,		"__GFP_NOWARN"},	\
 	{(unsigned long)__GFP_RETRY_MAYFAIL,	"__GFP_RETRY_MAYFAIL"},	\
 	{(unsigned long)__GFP_NOFAIL,		"__GFP_NOFAIL"},	\
-- 
cgit v1.2.3


From 7f0b5fb953e750a7410cc96c67a656d79db48bcb Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:38:10 -0800
Subject: mm, pagevec: rename pagevec drained field

According to Vlastimil Babka, the drained field in pagevec is
potentially misleading because it might be interpreted as draining this
pagevec instead of the percpu lru pagevecs.  Rename the field for
clarity.

Link: http://lkml.kernel.org/r/20171019093346.ylahzdpzmoriyf4v@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index eebefd209424..5fb6580f7f23 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -17,7 +17,7 @@ struct address_space;
 
 struct pagevec {
 	unsigned long nr;
-	bool drained;
+	bool percpu_pvec_drained;
 	struct page *pages[PAGEVEC_SIZE];
 };
 
@@ -53,7 +53,7 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
 static inline void pagevec_init(struct pagevec *pvec)
 {
 	pvec->nr = 0;
-	pvec->drained = false;
+	pvec->percpu_pvec_drained = false;
 }
 
 static inline void pagevec_reinit(struct pagevec *pvec)
-- 
cgit v1.2.3


From 4518085e127dff97e74f74a8780d7564e273bec8 Mon Sep 17 00:00:00 2001
From: Kemi Wang <kemi.wang@intel.com>
Date: Wed, 15 Nov 2017 17:38:22 -0800
Subject: mm, sysctl: make NUMA stats configurable

This is the second step which introduces a tunable interface that allow
numa stats configurable for optimizing zone_statistics(), as suggested
by Dave Hansen and Ying Huang.

=========================================================================

When page allocation performance becomes a bottleneck and you can
tolerate some possible tool breakage and decreased numa counter
precision, you can do:

	echo 0 > /proc/sys/vm/numa_stat

In this case, numa counter update is ignored.  We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and
reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315)
drop of cpu cycles per single page allocation and reclaim on Jesper's
page_bench03 (88 threads) running on a 2-Socket Broadwell-based server
(88 threads, 126G memory).

Benchmark link provided by Jesper D Brouer (increase loop times to
10000000):

  https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

=========================================================================

When page allocation performance is not a bottleneck and you want all
tooling to work, you can do:

	echo 1 > /proc/sys/vm/numa_stat

This is system default setting.

Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka
for comments to help improve the original patch.

[keescook@chromium.org: make sure mutex is a global static]
  Link: http://lkml.kernel.org/r/20171107213809.GA4314@beast
Link: http://lkml.kernel.org/r/1508290927-8518-1-git-send-email-kemi.wang@intel.com
Signed-off-by: Kemi Wang <kemi.wang@intel.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Suggested-by: Ying Huang <ying.huang@intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1e0cb72e0598..1779c9817b39 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -7,9 +7,19 @@
 #include <linux/mmzone.h>
 #include <linux/vm_event_item.h>
 #include <linux/atomic.h>
+#include <linux/static_key.h>
 
 extern int sysctl_stat_interval;
 
+#ifdef CONFIG_NUMA
+#define ENABLE_NUMA_STAT   1
+#define DISABLE_NUMA_STAT   0
+extern int sysctl_vm_numa_stat;
+DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
+extern int sysctl_vm_numa_stat_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *length, loff_t *ppos);
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
-- 
cgit v1.2.3


From d135e5750205a21a212a19dbb05aeb339e2cbea7 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Wed, 15 Nov 2017 17:38:41 -0800
Subject: mm/page_alloc.c: broken deferred calculation

In reset_deferred_meminit() we determine number of pages that must not
be deferred.  We initialize pages for at least 2G of memory, but also
pages for reserved memory in this node.

The reserved memory is determined in this function:
memblock_reserved_memory_within(), which operates over physical
addresses, and returns size in bytes.  However, reset_deferred_meminit()
assumes that that this function operates with pfns, and returns page
count.

The result is that in the best case machine boots slower than expected
due to initializing more pages than needed in single thread, and in the
worst case panics because fewer than needed pages are initialized early.

Link: http://lkml.kernel.org/r/20171021011707.15191-1-pasha.tatashin@oracle.com
Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39ccaa9c428b..67f2e3c38939 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -700,7 +700,8 @@ typedef struct pglist_data {
 	 * is the first PFN that needs to be initialised.
 	 */
 	unsigned long first_deferred_pfn;
-	unsigned long static_init_size;
+	/* Number of non-deferred pages */
+	unsigned long static_init_pgcnt;
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-- 
cgit v1.2.3


From 2bce774e8245e95db81872ec39522cde8b486fc8 Mon Sep 17 00:00:00 2001
From: Wang Long <wanglong19@meituan.com>
Date: Wed, 15 Nov 2017 17:39:03 -0800
Subject: writeback: remove unused function parameter

The parameter `struct bdi_writeback *wb` is not been used in the
function body.  Remove it.

Link: http://lkml.kernel.org/r/1509685485-15278-1-git-send-email-wanglong19@meituan.com
Signed-off-by: Wang Long <wanglong19@meituan.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/backing-dev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 7360e79e9a2f..e54e7e0033eb 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -93,7 +93,7 @@ extern void wb_writeout_inc(struct bdi_writeback *wb);
 /*
  * maximal error of a stat counter.
  */
-static inline unsigned long wb_stat_error(struct bdi_writeback *wb)
+static inline unsigned long wb_stat_error(void)
 {
 #ifdef CONFIG_SMP
 	return nr_cpu_ids * WB_STAT_BATCH;
-- 
cgit v1.2.3


From 0205f75571e3a70c35f0dd5e608773cce97d9dbb Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 15 Nov 2017 17:39:14 -0800
Subject: mm: simplify nodemask printing

alloc_warn() and dump_header() have to explicitly handle NULL nodemask
which forces both paths to use pr_cont.  We can do better.  printk
already handles NULL pointers properly so all we need is to teach
nodemask_pr_args to handle NULL nodemask carefully.  This allows
simplification of both alloc_warn() and dump_header() and gets rid of
pr_cont altogether.

This patch has been motivated by patch from Joe Perches

  http://lkml.kernel.org/r/b31236dfe3fc924054fd7842bde678e71d193638.1509991345.git.joe@perches.com

[akpm@linux-foundation.org: fix tile warning, per Arnd]
Link: http://lkml.kernel.org/r/20171109100531.3cn2hcqnuj7mjaju@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Joe Perches <joe@perches.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nodemask.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index de1c50b93c61..15cab3967d6d 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -104,7 +104,9 @@ extern nodemask_t _unused_nodemask_arg_;
  *
  * Can be used to provide arguments for '%*pb[l]' when printing a nodemask.
  */
-#define nodemask_pr_args(maskp)		MAX_NUMNODES, (maskp)->bits
+#define nodemask_pr_args(maskp)				\
+	((maskp) != NULL) ? MAX_NUMNODES : 0,		\
+	((maskp) != NULL) ? (maskp)->bits : NULL
 
 /*
  * The inline keyword gives the compiler room to decide to inline, or
-- 
cgit v1.2.3