From 6b524995a71d49ae032dba308d117dbf2a18d175 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 26 Jul 2016 15:21:05 -0700 Subject: dax: remote unused fault wrappers Remove the unused wrappers dax_fault() and dax_pmd_fault(). After this removal, rename __dax_fault() and __dax_pmd_fault() to dax_fault() and dax_pmd_fault() respectively, and update all callers. The dax_fault() and dax_pmd_fault() wrappers were initially intended to capture some filesystem independent functionality around page faults (calling sb_start_pagefault() & sb_end_pagefault(), updating file mtime and ctime). However, the following commits: 5726b27b09cc ("ext2: Add locking for DAX faults") ea3d7209ca01 ("ext4: fix races between page faults and hole punching") added locking to the ext2 and ext4 filesystems after these common operations but before __dax_fault() and __dax_pmd_fault() were called. This means that these wrappers are no longer used, and are unlikely to be used in the future. XFS has had locking analogous to what was recently added to ext2 and ext4 since DAX support was initially introduced by: 6b698edeeef0 ("xfs: add DAX file operations support") Link: http://lkml.kernel.org/r/20160714214049.20075-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dan Williams Cc: Dave Chinner Reviewed-by: Jan Kara Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 43d5f0b799c7..9c6dc7704043 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,7 +14,6 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); -int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, bool wake_all); @@ -46,19 +45,15 @@ static inline int __dax_zero_page_range(struct block_device *bdev, #if defined(CONFIG_TRANSPARENT_HUGEPAGE) int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, unsigned int flags, get_block_t); -int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned int flags, get_block_t); #else static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, get_block_t gb) { return VM_FAULT_FALLBACK; } -#define __dax_pmd_fault dax_pmd_fault #endif int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) -#define __dax_mkwrite(vma, vmf, gb) __dax_fault(vma, vmf, gb) static inline bool vma_is_dax(struct vm_area_struct *vma) { -- cgit v1.2.3 From 17359a80b9315670f56cd016a92c083665e80ae1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 26 Jul 2016 15:21:26 -0700 Subject: debugobjects.h: fix trivial kernel doc warning Add ':' to fix trivial kernel-doc warning in : ..//include/linux/debugobjects.h:63: warning: No description found for parameter 'is_static_object' Link: http://lkml.kernel.org/r/575B01B8.5060600@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- 
include/linux/debugobjects.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 46056cb161fc..d82bf1994485 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -38,7 +38,7 @@ struct debug_obj { * @name: name of the object typee * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object - * @is_static_object return true if the obj is static, otherwise return false + * @is_static_object: return true if the obj is static, otherwise return false * @fixup_init: fixup function, which is called when the init check * fails. All fixup functions must return true if fixup * was successful, otherwise return false -- cgit v1.2.3 From 6c60d2b5746cf23025ffe71bd7ff9075048fc90c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 26 Jul 2016 15:21:50 -0700 Subject: fs/fs-writeback.c: add a new writeback list for sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wait_sb_inodes() currently does a walk of all inodes in the filesystem to find dirty one to wait on during sync. This is highly inefficient and wastes a lot of CPU when there are lots of clean cached inodes that we don't need to wait on. To avoid this "all inode" walk, we need to track inodes that are currently under writeback that we need to wait for. We do this by adding inodes to a writeback list on the sb when the mapping is first tagged as having pages under writeback. wait_sb_inodes() can then walk this list of "inodes under IO" and wait specifically just for the inodes that the current sync(2) needs to wait for. Define a couple helpers to add/remove an inode from the writeback list and call them when the overall mapping is tagged for or cleared from writeback. Update wait_sb_inodes() to walk only the inodes under writeback due to the sync. With this change, filesystem sync times are significantly reduced for fs' with largely populated inode caches and otherwise no other work to do. For example, on a 16xcpu 2GHz x86-64 server, 10TB XFS filesystem with a ~10m entry inode cache, sync times are reduced from ~7.3s to less than 0.1s when the filesystem is fully clean. 
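For illustration only, a minimal sketch of what the two helpers declared in the writeback.h hunk below can look like, using just the i_wb_list, s_inodes_wb and s_inode_wblist_lock fields added by this patch (the real implementation lives in fs/fs-writeback.c and may differ in detail):

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/writeback.h>

/*
 * Hedged sketch, not the patch itself: maintain the per-sb list of
 * inodes that currently have pages under writeback.
 */
void sb_mark_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long flags;

        /* Added once, when the mapping first gets pages under writeback. */
        if (list_empty(&inode->i_wb_list)) {
                spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
                if (list_empty(&inode->i_wb_list))
                        list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
                spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
        }
}

void sb_clear_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long flags;

        /* Removed when the last writeback page on the mapping completes. */
        if (!list_empty(&inode->i_wb_list)) {
                spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
                list_del_init(&inode->i_wb_list);
                spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
        }
}

With this in place, wait_sb_inodes() only has to walk s_inodes_wb under s_inode_wblist_lock instead of every inode on s_inodes.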
Link: http://lkml.kernel.org/r/1466594593-6757-2-git-send-email-bfoster@redhat.com Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Signed-off-by: Brian Foster Reviewed-by: Jan Kara Tested-by: Holger Hoffstätte Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++++ include/linux/writeback.h | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index dd288148a6b1..0c9ebf530d9e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -665,6 +665,7 @@ struct inode { #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; + struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; @@ -1448,6 +1449,9 @@ struct super_block { /* s_inode_list_lock protects s_inodes */ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; /* all inodes */ + + spinlock_t s_inode_wblist_lock; + struct list_head s_inodes_wb; /* writeback inodes */ }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d0b5ca5d4e08..717e6149e753 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -384,4 +384,7 @@ void tag_pages_for_writeback(struct address_space *mapping, void account_page_redirty(struct page *page); +void sb_mark_inode_writeback(struct inode *inode); +void sb_clear_inode_writeback(struct inode *inode); + #endif /* WRITEBACK_H */ -- cgit v1.2.3 From 7c00fce98c3e15334a603925b41aa49f76e83227 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 26 Jul 2016 15:21:56 -0700 Subject: mm: reorganize SLAB freelist randomization The kernel heap allocators are using a sequential freelist making their allocation predictable. This predictability makes kernel heap overflow easier to exploit. An attacker can careful prepare the kernel heap to control the following chunk overflowed. For example these attacks exploit the predictability of the heap: - Linux Kernel CAN SLUB overflow (https://goo.gl/oMNWkU) - Exploiting Linux Kernel Heap corruptions (http://goo.gl/EXLn95) ***Problems that needed solving: - Randomize the Freelist (singled linked) used in the SLUB allocator. - Ensure good performance to encourage usage. - Get best entropy in early boot stage. ***Parts: - 01/02 Reorganize the SLAB Freelist randomization to share elements with the SLUB implementation. - 02/02 The SLUB Freelist randomization implementation. Similar approach than the SLAB but tailored to the singled freelist used in SLUB. ***Performance data: slab_test impact is between 3% to 4% on average for 100000 attempts without smp. It is a very focused testing, kernbench show the overall impact on the system is way lower. Before: Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 100000 times kmalloc(8) -> 49 cycles kfree -> 77 cycles 100000 times kmalloc(16) -> 51 cycles kfree -> 79 cycles 100000 times kmalloc(32) -> 53 cycles kfree -> 83 cycles 100000 times kmalloc(64) -> 62 cycles kfree -> 90 cycles 100000 times kmalloc(128) -> 81 cycles kfree -> 97 cycles 100000 times kmalloc(256) -> 98 cycles kfree -> 121 cycles 100000 times kmalloc(512) -> 95 cycles kfree -> 122 cycles 100000 times kmalloc(1024) -> 96 cycles kfree -> 126 cycles 100000 times kmalloc(2048) -> 115 cycles kfree -> 140 cycles 100000 times kmalloc(4096) -> 149 cycles kfree -> 171 cycles 2. 
Kmalloc: alloc/free test 100000 times kmalloc(8)/kfree -> 70 cycles 100000 times kmalloc(16)/kfree -> 70 cycles 100000 times kmalloc(32)/kfree -> 70 cycles 100000 times kmalloc(64)/kfree -> 70 cycles 100000 times kmalloc(128)/kfree -> 70 cycles 100000 times kmalloc(256)/kfree -> 69 cycles 100000 times kmalloc(512)/kfree -> 70 cycles 100000 times kmalloc(1024)/kfree -> 73 cycles 100000 times kmalloc(2048)/kfree -> 72 cycles 100000 times kmalloc(4096)/kfree -> 71 cycles After: Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 100000 times kmalloc(8) -> 57 cycles kfree -> 78 cycles 100000 times kmalloc(16) -> 61 cycles kfree -> 81 cycles 100000 times kmalloc(32) -> 76 cycles kfree -> 93 cycles 100000 times kmalloc(64) -> 83 cycles kfree -> 94 cycles 100000 times kmalloc(128) -> 106 cycles kfree -> 107 cycles 100000 times kmalloc(256) -> 118 cycles kfree -> 117 cycles 100000 times kmalloc(512) -> 114 cycles kfree -> 116 cycles 100000 times kmalloc(1024) -> 115 cycles kfree -> 118 cycles 100000 times kmalloc(2048) -> 147 cycles kfree -> 131 cycles 100000 times kmalloc(4096) -> 214 cycles kfree -> 161 cycles 2. Kmalloc: alloc/free test 100000 times kmalloc(8)/kfree -> 66 cycles 100000 times kmalloc(16)/kfree -> 66 cycles 100000 times kmalloc(32)/kfree -> 66 cycles 100000 times kmalloc(64)/kfree -> 66 cycles 100000 times kmalloc(128)/kfree -> 65 cycles 100000 times kmalloc(256)/kfree -> 67 cycles 100000 times kmalloc(512)/kfree -> 67 cycles 100000 times kmalloc(1024)/kfree -> 64 cycles 100000 times kmalloc(2048)/kfree -> 67 cycles 100000 times kmalloc(4096)/kfree -> 67 cycles Kernbench, before: Average Optimal load -j 12 Run (std deviation): Elapsed Time 101.873 (1.16069) User Time 1045.22 (1.60447) System Time 88.969 (0.559195) Percent CPU 1112.9 (13.8279) Context Switches 189140 (2282.15) Sleeps 99008.6 (768.091) After: Average Optimal load -j 12 Run (std deviation): Elapsed Time 102.47 (0.562732) User Time 1045.3 (1.34263) System Time 88.311 (0.342554) Percent CPU 1105.8 (6.49444) Context Switches 189081 (2355.78) Sleeps 99231.5 (800.358) This patch (of 2): This commit reorganizes the previous SLAB freelist randomization to prepare for the SLUB implementation. It moves functions that will be shared to slab_common. The entropy functions are changed to align with the SLUB implementation, now using get_random_(int|long) functions. These functions were chosen because they provide a bit more entropy early on boot and better performance when specific arch instructions are not available. 
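As an illustrative sketch of the shared randomization step (the demo_* name is an assumption, not necessarily the helper moved into slab_common.c; only get_random_long() and the prandom state helpers are taken as given), the pre-computed index sequence can be built with a Fisher-Yates shuffle seeded early in boot:

#include <linux/kernel.h>
#include <linux/random.h>

/*
 * Hedged sketch: build a randomized permutation of object indexes
 * [0, count), seeded from get_random_long() so some entropy is
 * available even early during boot.
 */
static void demo_random_seq_create(unsigned int *seq, unsigned int count)
{
        struct rnd_state state;
        unsigned int i, rand;

        for (i = 0; i < count; i++)
                seq[i] = i;

        if (count < 2)
                return;

        /* get_random_long() is cheap and usable before full RNG init. */
        prandom_seed_state(&state, get_random_long());

        /* Fisher-Yates shuffle of the object index array. */
        for (i = count - 1; i > 0; i--) {
                rand = prandom_u32_state(&state) % (i + 1);
                swap(seq[i], seq[rand]);
        }
}

Each cache would then keep the resulting array in the random_seq field whose type is changed in the slab_def.h hunk below.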
[akpm@linux-foundation.org: fix build] Link: http://lkml.kernel.org/r/1464295031-26375-2-git-send-email-thgarnie@google.com Signed-off-by: Thomas Garnier Reviewed-by: Kees Cook Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8694f7a5d92b..339ba027ade9 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -81,7 +81,7 @@ struct kmem_cache { #endif #ifdef CONFIG_SLAB_FREELIST_RANDOM - void *random_seq; + unsigned int *random_seq; #endif struct kmem_cache_node *node[MAX_NUMNODES]; -- cgit v1.2.3 From 210e7a43fa905bccafa9bb5966fba1d71f33eb8b Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 26 Jul 2016 15:21:59 -0700 Subject: mm: SLUB freelist randomization Implements freelist randomization for the SLUB allocator. It was previous implemented for the SLAB allocator. Both use the same configuration option (CONFIG_SLAB_FREELIST_RANDOM). The list is randomized during initialization of a new set of pages. The order on different freelist sizes is pre-computed at boot for performance. Each kmem_cache has its own randomized freelist. This security feature reduces the predictability of the kernel SLUB allocator against heap overflows rendering attacks much less stable. For example these attacks exploit the predictability of the heap: - Linux Kernel CAN SLUB overflow (https://goo.gl/oMNWkU) - Exploiting Linux Kernel Heap corruptions (http://goo.gl/EXLn95) Performance results: slab_test impact is between 3% to 4% on average for 100000 attempts without smp. It is a very focused testing, kernbench show the overall impact on the system is way lower. Before: Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 100000 times kmalloc(8) -> 49 cycles kfree -> 77 cycles 100000 times kmalloc(16) -> 51 cycles kfree -> 79 cycles 100000 times kmalloc(32) -> 53 cycles kfree -> 83 cycles 100000 times kmalloc(64) -> 62 cycles kfree -> 90 cycles 100000 times kmalloc(128) -> 81 cycles kfree -> 97 cycles 100000 times kmalloc(256) -> 98 cycles kfree -> 121 cycles 100000 times kmalloc(512) -> 95 cycles kfree -> 122 cycles 100000 times kmalloc(1024) -> 96 cycles kfree -> 126 cycles 100000 times kmalloc(2048) -> 115 cycles kfree -> 140 cycles 100000 times kmalloc(4096) -> 149 cycles kfree -> 171 cycles 2. Kmalloc: alloc/free test 100000 times kmalloc(8)/kfree -> 70 cycles 100000 times kmalloc(16)/kfree -> 70 cycles 100000 times kmalloc(32)/kfree -> 70 cycles 100000 times kmalloc(64)/kfree -> 70 cycles 100000 times kmalloc(128)/kfree -> 70 cycles 100000 times kmalloc(256)/kfree -> 69 cycles 100000 times kmalloc(512)/kfree -> 70 cycles 100000 times kmalloc(1024)/kfree -> 73 cycles 100000 times kmalloc(2048)/kfree -> 72 cycles 100000 times kmalloc(4096)/kfree -> 71 cycles After: Single thread testing ===================== 1. 
Kmalloc: Repeatedly allocate then free test 100000 times kmalloc(8) -> 57 cycles kfree -> 78 cycles 100000 times kmalloc(16) -> 61 cycles kfree -> 81 cycles 100000 times kmalloc(32) -> 76 cycles kfree -> 93 cycles 100000 times kmalloc(64) -> 83 cycles kfree -> 94 cycles 100000 times kmalloc(128) -> 106 cycles kfree -> 107 cycles 100000 times kmalloc(256) -> 118 cycles kfree -> 117 cycles 100000 times kmalloc(512) -> 114 cycles kfree -> 116 cycles 100000 times kmalloc(1024) -> 115 cycles kfree -> 118 cycles 100000 times kmalloc(2048) -> 147 cycles kfree -> 131 cycles 100000 times kmalloc(4096) -> 214 cycles kfree -> 161 cycles 2. Kmalloc: alloc/free test 100000 times kmalloc(8)/kfree -> 66 cycles 100000 times kmalloc(16)/kfree -> 66 cycles 100000 times kmalloc(32)/kfree -> 66 cycles 100000 times kmalloc(64)/kfree -> 66 cycles 100000 times kmalloc(128)/kfree -> 65 cycles 100000 times kmalloc(256)/kfree -> 67 cycles 100000 times kmalloc(512)/kfree -> 67 cycles 100000 times kmalloc(1024)/kfree -> 64 cycles 100000 times kmalloc(2048)/kfree -> 67 cycles 100000 times kmalloc(4096)/kfree -> 67 cycles Kernbench, before: Average Optimal load -j 12 Run (std deviation): Elapsed Time 101.873 (1.16069) User Time 1045.22 (1.60447) System Time 88.969 (0.559195) Percent CPU 1112.9 (13.8279) Context Switches 189140 (2282.15) Sleeps 99008.6 (768.091) After: Average Optimal load -j 12 Run (std deviation): Elapsed Time 102.47 (0.562732) User Time 1045.3 (1.34263) System Time 88.311 (0.342554) Percent CPU 1105.8 (6.49444) Context Switches 189081 (2355.78) Sleeps 99231.5 (800.358) Link: http://lkml.kernel.org/r/1464295031-26375-3-git-send-email-thgarnie@google.com Signed-off-by: Thomas Garnier Reviewed-by: Kees Cook Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d1faa019c02a..5624c1f3eb0a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -99,6 +99,11 @@ struct kmem_cache { */ int remote_node_defrag_ratio; #endif + +#ifdef CONFIG_SLAB_FREELIST_RANDOM + unsigned int *random_seq; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; -- cgit v1.2.3 From 91c6a05f72a996bee5133e76374ab3ad7d3b9b72 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 26 Jul 2016 15:22:08 -0700 Subject: mm: faster kmalloc_array(), kcalloc() When both arguments to kmalloc_array() or kcalloc() are known at compile time then their product is known at compile time but search for kmalloc cache happens at runtime not at compile time. 
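A caller-side illustration (hypothetical example, not part of the patch) of what the added __builtin_constant_p() test buys:

#include <linux/slab.h>

struct demo_entry {
        unsigned long key;
        unsigned long val;
};

/* demo_* names are invented for illustration. */
static struct demo_entry *demo_alloc_tables(unsigned int runtime_n)
{
        /*
         * Both arguments are compile-time constants, so the overflow check
         * and the product fold away and the call takes the plain kmalloc()
         * path, letting the compiler pick the kmalloc cache.
         */
        struct demo_entry *fixed = kmalloc_array(16, sizeof(*fixed), GFP_KERNEL);

        kfree(fixed);

        /* A runtime-sized request is still checked and routed to __kmalloc(). */
        return kmalloc_array(runtime_n, sizeof(struct demo_entry), GFP_KERNEL);
}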
Link: http://lkml.kernel.org/r/20160627213454.GA2440@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index aeb3e6d00a66..1a4ea551aae5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -565,6 +565,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) { if (size != 0 && n > SIZE_MAX / size) return NULL; + if (__builtin_constant_p(n) && __builtin_constant_p(size)) + return kmalloc(n * size, flags); return __kmalloc(n * size, flags); } -- cgit v1.2.3 From df429ac039360005299d56247647ca77098d660e Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Tue, 26 Jul 2016 15:22:23 -0700 Subject: memory-hotplug: more general validation of zone during online When memory is onlined, we are only able to rezone from ZONE_MOVABLE to ZONE_KERNEL, or from (ZONE_MOVABLE - 1) to ZONE_MOVABLE. To be more flexible, use the following criteria instead; to online memory from zone X into zone Y, * Any zones between X and Y must be unused. * If X is lower than Y, the onlined memory must lie at the end of X. * If X is higher than Y, the onlined memory must lie at the start of X. Add zone_can_shift() to make this determination. Link: http://lkml.kernel.org/r/1462816419-4479-3-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Reviewd-by: Yasuaki Ishimatsu Cc: Greg Kroah-Hartman Cc: Daniel Kiper Cc: Dan Williams Cc: Vlastimil Babka Cc: Tang Chen Cc: Joonsoo Kim Cc: David Vrabel Cc: Vitaly Kuznetsov Cc: David Rientjes Cc: Andrew Banman Cc: Chen Yucong Cc: Yasunori Goto Cc: Zhang Zhen Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5145620ba48a..01033fadea47 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -284,5 +284,7 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); +extern int zone_can_shift(unsigned long pfn, unsigned long nr_pages, + enum zone_type target); #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From 798fd756952c4b6cb7dfe6f6437e9f02da79a5bc Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:22:30 -0700 Subject: mm: zap ZONE_OOM_LOCKED Not used since oom_lock was instroduced. 
Link: http://lkml.kernel.org/r/1464358093-22663-1-git-send-email-vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 02069c23486d..3388ccbab7d6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -524,7 +524,6 @@ struct zone { enum zone_flags { ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ - ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ ZONE_CONGESTED, /* zone has many dirty pages backed by * a congested BDI */ -- cgit v1.2.3 From 2a966b77ae3ede207e787e7538b87d1011c4364e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:22:33 -0700 Subject: mm: oom: add memcg to oom_control It's a part of oom context just like allocation order and nodemask, so let's move it to oom_control instead of passing it in the argument list. Link: http://lkml.kernel.org/r/40e03fd7aaf1f55c75d787128d6d17c5a71226c2.1464358556.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 83469522690a..cbc24a5fe28d 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -23,6 +23,9 @@ struct oom_control { /* Used to determine mempolicy */ nodemask_t *nodemask; + /* Memory cgroup in which oom is invoked, or NULL for global oom */ + struct mem_cgroup *memcg; + /* Used to determine cpuset and node locality requirement */ const gfp_t gfp_mask; @@ -83,11 +86,10 @@ extern unsigned long oom_badness(struct task_struct *p, extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, const char *message); + const char *message); extern void check_panic_on_oom(struct oom_control *oc, - enum oom_constraint constraint, - struct mem_cgroup *memcg); + enum oom_constraint constraint); extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, struct task_struct *task, unsigned long totalpages); -- cgit v1.2.3 From a54f9aebaa9f0ea2ce6b01f12b65062fb2e74e6c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 26 Jul 2016 15:22:36 -0700 Subject: include/linux/mmdebug.h: add VM_WARN which maps to WARN() This enables us to do VM_WARN(condition, "warn message"); Link: http://lkml.kernel.org/r/1464692688-6612-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index de7be78c6f0e..451a811f48f2 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -39,6 +39,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON(cond) WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) #define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) +#define VM_WARN(cond, format...) 
WARN(cond, format) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) @@ -47,6 +48,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_DEBUG_VIRTUAL -- cgit v1.2.3 From bda807d4445414e8e77da704f116bb0880fe0c76 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Jul 2016 15:23:05 -0700 Subject: mm: migrate: support non-lru movable page migration We have allowed migration for only LRU pages until now and it was enough to make high-order pages. But recently, embedded system(e.g., webOS, android) uses lots of non-movable pages(e.g., zram, GPU memory) so we have seen several reports about troubles of small high-order allocation. For fixing the problem, there were several efforts (e,g,. enhance compaction algorithm, SLUB fallback to 0-order page, reserved memory, vmalloc and so on) but if there are lots of non-movable pages in system, their solutions are void in the long run. So, this patch is to support facility to change non-movable pages with movable. For the feature, this patch introduces functions related to migration to address_space_operations as well as some page flags. If a driver want to make own pages movable, it should define three functions which are function pointers of struct address_space_operations. 1. bool (*isolate_page) (struct page *page, isolate_mode_t mode); What VM expects on isolate_page function of driver is to return *true* if driver isolates page successfully. On returing true, VM marks the page as PG_isolated so concurrent isolation in several CPUs skip the page for isolation. If a driver cannot isolate the page, it should return *false*. Once page is successfully isolated, VM uses page.lru fields so driver shouldn't expect to preserve values in that fields. 2. int (*migratepage) (struct address_space *mapping, struct page *newpage, struct page *oldpage, enum migrate_mode); After isolation, VM calls migratepage of driver with isolated page. The function of migratepage is to move content of the old page to new page and set up fields of struct page newpage. Keep in mind that you should indicate to the VM the oldpage is no longer movable via __ClearPageMovable() under page_lock if you migrated the oldpage successfully and returns 0. If driver cannot migrate the page at the moment, driver can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time because VM interprets -EAGAIN as "temporal migration failure". On returning any error except -EAGAIN, VM will give up the page migration without retrying in this time. Driver shouldn't touch page.lru field VM using in the functions. 3. void (*putback_page)(struct page *); If migration fails on isolated page, VM should return the isolated page to the driver so VM calls driver's putback_page with migration failed page. In this function, driver should put the isolated page back to the own data structure. 4. non-lru movable page flags There are two page flags for supporting non-lru movable page. * PG_movable Driver should use the below function to make page movable under page_lock. void __SetPageMovable(struct page *page, struct address_space *mapping) It needs argument of address_space for registering migration family functions which will be called by VM. 
Exactly speaking, PG_movable is not a real flag of struct page. Rather than, VM reuses page->mapping's lower bits to represent it. #define PAGE_MAPPING_MOVABLE 0x2 page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; so driver shouldn't access page->mapping directly. Instead, driver should use page_mapping which mask off the low two bits of page->mapping so it can get right struct address_space. For testing of non-lru movable page, VM supports __PageMovable function. However, it doesn't guarantee to identify non-lru movable page because page->mapping field is unified with other variables in struct page. As well, if driver releases the page after isolation by VM, page->mapping doesn't have stable value although it has PAGE_MAPPING_MOVABLE (Look at __ClearPageMovable). But __PageMovable is cheap to catch whether page is LRU or non-lru movable once the page has been isolated. Because LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also good for just peeking to test non-lru movable pages before more expensive checking with lock_page in pfn scanning to select victim. For guaranteeing non-lru movable page, VM provides PageMovable function. Unlike __PageMovable, PageMovable functions validates page->mapping and mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden destroying of page->mapping. Driver using __SetPageMovable should clear the flag via __ClearMovablePage under page_lock before the releasing the page. * PG_isolated To prevent concurrent isolation among several CPUs, VM marks isolated page as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru movable page, it can skip it. Driver doesn't need to manipulate the flag because VM will set/clear it automatically. Keep in mind that if driver sees PG_isolated page, it means the page have been isolated by VM so it shouldn't touch page.lru field. PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag for own purpose. 
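A hypothetical driver wiring up these callbacks could look roughly like the sketch below; every demo_* name is invented for illustration, error handling is omitted, and a real driver must also manage page references and its own metadata:

#include <linux/compaction.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/string.h>

/* Hypothetical driver state. */
struct demo_dev {
        spinlock_t lock;
        struct list_head pages;
        struct inode *inode;    /* provides the address_space we register */
};

static bool demo_isolate_page(struct page *page, isolate_mode_t mode)
{
        struct demo_dev *dev = (struct demo_dev *)page_private(page);

        /* Detach from our own bookkeeping; the VM will reuse page->lru. */
        spin_lock(&dev->lock);
        list_del_init(&page->lru);
        spin_unlock(&dev->lock);
        return true;                    /* isolation succeeded */
}

static int demo_migratepage(struct address_space *mapping,
                struct page *newpage, struct page *oldpage,
                enum migrate_mode mode)
{
        struct demo_dev *dev = (struct demo_dev *)page_private(oldpage);
        void *src, *dst;

        /* Move the contents; a real driver also migrates its metadata. */
        src = kmap_atomic(oldpage);
        dst = kmap_atomic(newpage);
        memcpy(dst, src, PAGE_SIZE);
        kunmap_atomic(dst);
        kunmap_atomic(src);

        /* Track the new page instead of the old one. */
        set_page_private(newpage, (unsigned long)dev);
        set_page_private(oldpage, 0);
        __SetPageMovable(newpage, mapping);     /* both pages are locked here */
        spin_lock(&dev->lock);
        list_add(&newpage->lru, &dev->pages);
        spin_unlock(&dev->lock);

        __ClearPageMovable(oldpage);    /* old page is no longer movable */
        return 0;                       /* or -EAGAIN for a temporary failure */
}

static void demo_putback_page(struct page *page)
{
        struct demo_dev *dev = (struct demo_dev *)page_private(page);

        /* Migration failed: take the isolated page back onto our list. */
        spin_lock(&dev->lock);
        list_add(&page->lru, &dev->pages);
        spin_unlock(&dev->lock);
}

/* Installed as dev->inode->i_mapping->a_ops by the driver's setup code. */
static const struct address_space_operations demo_aops = {
        .isolate_page   = demo_isolate_page,
        .migratepage    = demo_migratepage,
        .putback_page   = demo_putback_page,
};

/* Pages handed to the driver are marked movable under page lock: */
static void demo_add_page(struct demo_dev *dev, struct page *page)
{
        lock_page(page);
        set_page_private(page, (unsigned long)dev);
        __SetPageMovable(page, dev->inode->i_mapping);
        unlock_page(page);

        spin_lock(&dev->lock);
        list_add(&page->lru, &dev->pages);
        spin_unlock(&dev->lock);
}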
[opensource.ganesh@gmail.com: mm/compaction: remove local variable is_lru] Link: http://lkml.kernel.org/r/20160618014841.GA7422@leo-test Link: http://lkml.kernel.org/r/1464736881-24886-3-git-send-email-minchan@kernel.org Signed-off-by: Gioh Kim Signed-off-by: Minchan Kim Signed-off-by: Ganesh Mahendran Acked-by: Vlastimil Babka Cc: Sergey Senozhatsky Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: Hugh Dickins Cc: Rafael Aquini Cc: Jonathan Corbet Cc: John Einar Reitan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 17 +++++++++++++++++ include/linux/fs.h | 2 ++ include/linux/ksm.h | 3 +-- include/linux/migrate.h | 2 ++ include/linux/mm.h | 1 + include/linux/page-flags.h | 33 +++++++++++++++++++++++---------- 6 files changed, 46 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a58c852a268f..c6b47c861cea 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -54,6 +54,9 @@ enum compact_result { struct alloc_context; /* in mm/internal.h */ #ifdef CONFIG_COMPACTION +extern int PageMovable(struct page *page); +extern void __SetPageMovable(struct page *page, struct address_space *mapping); +extern void __ClearPageMovable(struct page *page); extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); @@ -151,6 +154,19 @@ extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); #else +static inline int PageMovable(struct page *page) +{ + return 0; +} +static inline void __SetPageMovable(struct page *page, + struct address_space *mapping) +{ +} + +static inline void __ClearPageMovable(struct page *page) +{ +} + static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, @@ -212,6 +228,7 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i #endif /* CONFIG_COMPACTION */ #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +struct node; extern int compaction_register_node(struct node *node); extern void compaction_unregister_node(struct node *node); diff --git a/include/linux/fs.h b/include/linux/fs.h index 0c9ebf530d9e..97fe08d17d89 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -402,6 +402,8 @@ struct address_space_operations { */ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); + bool (*isolate_page)(struct page *, isolate_mode_t); + void (*putback_page)(struct page *); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 7ae216a39c9e..481c8c4627ca 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -43,8 +43,7 @@ static inline struct stable_node *page_stable_node(struct page *page) static inline void set_page_stable_node(struct page *page, struct stable_node *stable_node) { - page->mapping = (void *)stable_node + - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } /* diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9b50325e4ddf..404fbfefeb33 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -37,6 +37,8 @@ extern int migrate_page(struct address_space *, struct page 
*, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); +extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); +extern void putback_movable_page(struct page *page); extern int migrate_prep(void); extern int migrate_prep_local(void); diff --git a/include/linux/mm.h b/include/linux/mm.h index ece042dfe23c..3e22335a435c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1035,6 +1035,7 @@ static inline pgoff_t page_file_index(struct page *page) } bool page_mapped(struct page *page); +struct address_space *page_mapping(struct page *page); /* * Return true only if the page has been allocated with diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e5a32445f930..f36dbb3a3060 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -129,6 +129,9 @@ enum pageflags { /* Compound pages. Stored in first tail page's flags */ PG_double_map = PG_private_2, + + /* non-lru isolated movable page */ + PG_isolated = PG_reclaim, }; #ifndef __GENERATING_BOUNDS_H @@ -357,29 +360,37 @@ PAGEFLAG(Idle, idle, PF_ANY) * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, - * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; - * and then page->mapping points, not to an anon_vma, but to a private + * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON + * bit; and then page->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * - * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. + * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable + * page and then page->mapping points a struct address_space. * * Please note that, confusingly, "page_mapping" refers to the inode * address_space which maps the page from disk; whereas "page_mapped" * refers to user virtual address space into which the page is mapped. 
*/ -#define PAGE_MAPPING_ANON 1 -#define PAGE_MAPPING_KSM 2 -#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) +#define PAGE_MAPPING_ANON 0x1 +#define PAGE_MAPPING_MOVABLE 0x2 +#define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) +#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) -static __always_inline int PageAnonHead(struct page *page) +static __always_inline int PageMappingFlags(struct page *page) { - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } static __always_inline int PageAnon(struct page *page) { page = compound_head(page); - return PageAnonHead(page); + return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; +} + +static __always_inline int __PageMovable(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == + PAGE_MAPPING_MOVABLE; } #ifdef CONFIG_KSM @@ -393,7 +404,7 @@ static __always_inline int PageKsm(struct page *page) { page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + PAGE_MAPPING_KSM; } #else TESTPAGEFLAG_FALSE(Ksm) @@ -641,6 +652,8 @@ static inline void __ClearPageBalloon(struct page *page) atomic_set(&page->_mapcount, -1); } +__PAGEFLAG(Isolated, isolated, PF_ANY); + /* * If network-based swap is enabled, sl*b must keep track of whether pages * were allocated from pfmemalloc reserves. -- cgit v1.2.3 From b1123ea6d3b3da25af5c8a9d843bd07ab63213f4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Jul 2016 15:23:09 -0700 Subject: mm: balloon: use general non-lru movable page feature Now, VM has a feature to migrate non-lru movable pages so balloon doesn't need custom migration hooks in migrate.c and compaction.c. Instead, this patch implements the page->mapping->a_ops-> {isolate|migrate|putback} functions. With that, we could remove hooks for ballooning in general migration functions and make balloon compaction simple. [akpm@linux-foundation.org: compaction.h requires that the includer first include node.h] Link: http://lkml.kernel.org/r/1464736881-24886-4-git-send-email-minchan@kernel.org Signed-off-by: Gioh Kim Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Cc: Rafael Aquini Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 54 ++++++++++++-------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 9b0a15d06a4f..504bd724e6ab 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -45,9 +45,11 @@ #define _LINUX_BALLOON_COMPACTION_H #include #include -#include +#include +#include #include #include +#include /* * Balloon device information descriptor. 
@@ -62,6 +64,7 @@ struct balloon_dev_info { struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); + struct inode *inode; }; extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); @@ -73,44 +76,18 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; + balloon->inode = NULL; } #ifdef CONFIG_BALLOON_COMPACTION -extern bool balloon_page_isolate(struct page *page); +extern const struct address_space_operations balloon_aops; +extern bool balloon_page_isolate(struct page *page, + isolate_mode_t mode); extern void balloon_page_putback(struct page *page); -extern int balloon_page_migrate(struct page *newpage, +extern int balloon_page_migrate(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode); -/* - * __is_movable_balloon_page - helper to perform @page PageBalloon tests - */ -static inline bool __is_movable_balloon_page(struct page *page) -{ - return PageBalloon(page); -} - -/* - * balloon_page_movable - test PageBalloon to identify balloon pages - * and PagePrivate to check that the page is not - * isolated and can be moved by compaction/migration. - * - * As we might return false positives in the case of a balloon page being just - * released under us, this need to be re-tested later, under the page lock. - */ -static inline bool balloon_page_movable(struct page *page) -{ - return PageBalloon(page) && PagePrivate(page); -} - -/* - * isolated_balloon_page - identify an isolated balloon page on private - * compaction/migration page lists. - */ -static inline bool isolated_balloon_page(struct page *page) -{ - return PageBalloon(page); -} - /* * balloon_page_insert - insert a page into the balloon's page list and make * the page->private assignment accordingly. @@ -124,7 +101,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { __SetPageBalloon(page); - SetPagePrivate(page); + __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); } @@ -140,11 +117,14 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, static inline void balloon_page_delete(struct page *page) { __ClearPageBalloon(page); + __ClearPageMovable(page); set_page_private(page, 0); - if (PagePrivate(page)) { - ClearPagePrivate(page); + /* + * No touch page.lru field once @page has been isolated + * because VM is using the field. + */ + if (!PageIsolated(page)) list_del(&page->lru); - } } /* -- cgit v1.2.3 From 66c64223ad4e7a4a9161fcd9606426d9f57227ca Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 26 Jul 2016 15:23:40 -0700 Subject: mm/compaction: split freepages without holding the zone lock We don't need to split freepages with holding the zone lock. It will cause more contention on zone lock so not desirable. 
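The header hunk below only removes the split_free_page() declaration; a rough sketch of the pattern argued for here (assumed code, not the actual mm/compaction.c changes) keeps zone->lock only around the isolation itself and leaves splitting to a later step that runs without the lock:

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>

/* Illustrative only; __isolate_free_page() is declared in mm/internal.h. */
static int demo_isolate_freepage(struct zone *zone, struct page *page,
                unsigned int order, struct list_head *freelist)
{
        unsigned long flags;
        int isolated;

        /* Take zone->lock only for the isolation itself. */
        spin_lock_irqsave(&zone->lock, flags);
        isolated = __isolate_free_page(page, order);
        spin_unlock_irqrestore(&zone->lock, flags);

        /* Only queue the page on success; splitting to order-0 pages and
         * mapping them happens later, after the lock has been dropped. */
        if (isolated)
                list_add_tail(&page->lru, freelist);

        return isolated;
}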
[rientjes@google.com: if __isolate_free_page() fails, avoid adding to freelist so we don't call map_pages() with it] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606211447001.43430@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/1464230275-25791-1-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Minchan Kim Cc: Alexander Potapenko Cc: Hugh Dickins Cc: Michal Hocko Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3e22335a435c..6c9a394b2979 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -537,7 +537,6 @@ void __put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); -int split_free_page(struct page *page); /* * Compound pages have a destructor function. Provide a -- cgit v1.2.3 From a9627bc5e34e79ae80a33241b8a1501cc498e191 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 26 Jul 2016 15:23:49 -0700 Subject: mm/page_owner: introduce split_page_owner and replace manual handling split_page() calls set_page_owner() to set up page_owner to each pages. But, it has a drawback that head page and the others have different stacktrace because callsite of set_page_owner() is slightly differnt. To avoid this problem, this patch copies head page's page_owner to the others. It needs to introduce new function, split_page_owner() but it also remove the other function, get_page_owner_gfp() so looks good to do. Link: http://lkml.kernel.org/r/1464230275-25791-4-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Minchan Kim Cc: Alexander Potapenko Cc: Hugh Dickins Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_owner.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 46f1b939948c..30583ab0ffb1 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -10,7 +10,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); -extern gfp_t __get_page_owner_gfp(struct page *page); +extern void __split_page_owner(struct page *page, unsigned int order); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(struct page *page); @@ -28,12 +28,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline gfp_t get_page_owner_gfp(struct page *page) +static inline void split_page_owner(struct page *page, unsigned int order) { if (static_branch_unlikely(&page_owner_inited)) - return __get_page_owner_gfp(page); - else - return 0; + __split_page_owner(page, order); } static inline void copy_page_owner(struct page *oldpage, struct page *newpage) { @@ -58,9 +56,9 @@ static inline void set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { } -static inline gfp_t get_page_owner_gfp(struct page *page) +static inline void split_page_owner(struct page *page, + unsigned int order) { - return 0; } static 
inline void copy_page_owner(struct page *oldpage, struct page *newpage) { -- cgit v1.2.3 From f2ca0b55710752588ccff5224a11e6aea43a996a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 26 Jul 2016 15:23:55 -0700 Subject: mm/page_owner: use stackdepot to store stacktrace Currently, we store each page's allocation stacktrace on corresponding page_ext structure and it requires a lot of memory. This causes the problem that memory tight system doesn't work well if page_owner is enabled. Moreover, even with this large memory consumption, we cannot get full stacktrace because we allocate memory at boot time and just maintain 8 stacktrace slots to balance memory consumption. We could increase it to more but it would make system unusable or change system behaviour. To solve the problem, this patch uses stackdepot to store stacktrace. It obviously provides memory saving but there is a drawback that stackdepot could fail. stackdepot allocates memory at runtime so it could fail if system has not enough memory. But, most of allocation stack are generated at very early time and there are much memory at this time. So, failure would not happen easily. And, one failure means that we miss just one page's allocation stacktrace so it would not be a big problem. In this patch, when memory allocation failure happens, we store special stracktrace handle to the page that is failed to save stacktrace. With it, user can guess memory usage properly even if failure happens. Memory saving looks as following. (4GB memory system with page_owner) (before the patch -> after the patch) static allocation: 92274688 bytes -> 25165824 bytes dynamic allocation after boot + kernel build: 0 bytes -> 327680 bytes total: 92274688 bytes -> 25493504 bytes 72% reduction in total. Note that implementation looks complex than someone would imagine because there is recursion issue. stackdepot uses page allocator and page_owner is called at page allocation. Using stackdepot in page_owner could re-call page allcator and then page_owner. That is a recursion. To detect and avoid it, whenever we obtain stacktrace, recursion is checked and page_owner is set to dummy information if found. Dummy information means that this page is allocated for page_owner feature itself (such as stackdepot) and it's understandable behavior for user. 
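A rough sketch of the save/fetch flow described above, assuming the depot_save_stack()/depot_fetch_stack() interface of lib/stackdepot.c from this period; the demo_* helpers and the fixed 16-entry depth are illustrative only:

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

/* Illustrative only: capture the current stack as a compact depot handle. */
static depot_stack_handle_t demo_save_stack(gfp_t flags)
{
        unsigned long entries[16];
        struct stack_trace trace = {
                .entries        = entries,
                .max_entries    = ARRAY_SIZE(entries),
                .skip           = 2,
        };

        save_stack_trace(&trace);

        /*
         * depot_save_stack() may allocate pages itself, which would recurse
         * back into page_owner; the real code detects that and records a
         * dummy handle instead.  It can also return 0 when memory is tight,
         * which simply means this one page has no saved stack.
         */
        return depot_save_stack(&trace, flags);
}

/* Later (e.g. when dumping page_owner data) the handle is expanded again. */
static void demo_print_stack(depot_stack_handle_t handle)
{
        struct stack_trace trace = { };

        depot_fetch_stack(handle, &trace);
        print_stack_trace(&trace, 0);
}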
[iamjoonsoo.kim@lge.com: mm-page_owner-use-stackdepot-to-store-stacktrace-v3] Link: http://lkml.kernel.org/r/1464230275-25791-6-git-send-email-iamjoonsoo.kim@lge.com Link: http://lkml.kernel.org/r/1466150259-27727-7-git-send-email-iamjoonsoo.kim@lge.com Link: http://lkml.kernel.org/r/1464230275-25791-6-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Minchan Kim Cc: Alexander Potapenko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index e1fe7cf5bddf..03f2a3e7d76d 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -3,6 +3,7 @@ #include #include +#include struct pglist_data; struct page_ext_operations { @@ -44,9 +45,8 @@ struct page_ext { #ifdef CONFIG_PAGE_OWNER unsigned int order; gfp_t gfp_mask; - unsigned int nr_entries; int last_migrate_reason; - unsigned long trace_entries[8]; + depot_stack_handle_t handle; #endif }; -- cgit v1.2.3 From 99691addb42919251dcc082a70b7a11733dfcbcc Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:24:16 -0700 Subject: mm: remove pointless struct in struct page definition This patchset implements per kmemcg accounting of page tables (x86-only), pipe buffers, and unix socket buffers. Patches 1-3 are just cleanups that are not supposed to introduce any functional changes. Patches 4 and 5 move charge/uncharge to generic page allocator paths for the sake of accounting pipe and unix socket buffers. Patches 5-7 make x86 page tables, pipe buffers, and unix socket buffers accountable. This patch (of 8): ... to reduce indentation level thus leaving more space for comments. Link: http://lkml.kernel.org/r/f34ffe70fce2b0b9220856437f77972d67c14275.1464079537.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Eric Dumazet Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 68 +++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 917f2b6a0cde..a50ad735d518 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -60,51 +60,47 @@ struct page { }; /* Second double word */ - struct { - union { - pgoff_t index; /* Our offset within mapping. */ - void *freelist; /* sl[aou]b first free object */ - /* page_deferred_list().prev -- second tail page */ - }; + union { + pgoff_t index; /* Our offset within mapping. */ + void *freelist; /* sl[aou]b first free object */ + /* page_deferred_list().prev -- second tail page */ + }; - union { + union { #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - /* Used for cmpxchg_double in slub */ - unsigned long counters; + /* Used for cmpxchg_double in slub */ + unsigned long counters; #else - /* - * Keep _refcount separate from slub cmpxchg_double - * data. As the rest of the double word is protected by - * slab_lock but _refcount is not. - */ - unsigned counters; + /* + * Keep _refcount separate from slub cmpxchg_double data. + * As the rest of the double word is protected by slab_lock + * but _refcount is not. 
+ */ + unsigned counters; #endif + struct { - struct { - - union { - /* - * Count of ptes mapped in mms, to show - * when page is mapped & limit reverse - * map searches. - */ - atomic_t _mapcount; - - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - int units; /* SLOB */ - }; + union { /* - * Usage count, *USE WRAPPER FUNCTION* - * when manual accounting. See page_ref.h + * Count of ptes mapped in mms, to show when + * page is mapped & limit reverse map searches. */ - atomic_t _refcount; + atomic_t _mapcount; + + unsigned int active; /* SLAB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + int units; /* SLOB */ }; - unsigned int active; /* SLAB */ + /* + * Usage count, *USE WRAPPER FUNCTION* when manual + * accounting. See page_ref.h + */ + atomic_t _refcount; }; }; -- cgit v1.2.3 From 632c0a1affd861f81abdd136c886418571e19a51 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:24:18 -0700 Subject: mm: clean up non-standard page->_mapcount users - Add a proper comment to page->_mapcount. - Introduce a macro for generating helper functions. - Place all special page->_mapcount values next to each other so that readers can see all possible values and so we don't get duplicates. Link: http://lkml.kernel.org/r/502f49000e0b63e6c62e338fac6b420bf34fb526.1464079537.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Eric Dumazet Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 5 ++++ include/linux/page-flags.h | 73 ++++++++++++++++++++-------------------------- 2 files changed, 37 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a50ad735d518..79472b22d23f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -85,6 +85,11 @@ struct page { /* * Count of ptes mapped in mms, to show when * page is mapped & limit reverse map searches. + * + * Extra information about page type may be + * stored here for pages that are never mapped, + * in which case the value MUST BE <= -2. + * See page-flags.h for more details. */ atomic_t _mapcount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f36dbb3a3060..96084ee74ee8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -604,54 +604,45 @@ TESTPAGEFLAG_FALSE(DoubleMap) #endif /* - * PageBuddy() indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * - * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to - * -2 so that an underflow of the page_mapcount() won't be mistaken - * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very - * efficiently by most CPU architectures. + * For pages that are never mapped to userspace, page->mapcount may be + * used for storing extra information about page type. Any value used + * for this purpose must be <= -2, but it's better start not too close + * to -2 so that an underflow of the page_mapcount() won't be mistaken + * for a special page. 
*/ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) - -static inline int PageBuddy(struct page *page) -{ - return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; +#define PAGE_MAPCOUNT_OPS(uname, lname) \ +static __always_inline int Page##uname(struct page *page) \ +{ \ + return atomic_read(&page->_mapcount) == \ + PAGE_##lname##_MAPCOUNT_VALUE; \ +} \ +static __always_inline void __SetPage##uname(struct page *page) \ +{ \ + VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \ + atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \ +} \ +static __always_inline void __ClearPage##uname(struct page *page) \ +{ \ + VM_BUG_ON_PAGE(!Page##uname(page), page); \ + atomic_set(&page->_mapcount, -1); \ } -static inline void __SetPageBuddy(struct page *page) -{ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); - atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); -} +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + */ +#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) +PAGE_MAPCOUNT_OPS(Buddy, BUDDY) -static inline void __ClearPageBuddy(struct page *page) -{ - VM_BUG_ON_PAGE(!PageBuddy(page), page); - atomic_set(&page->_mapcount, -1); -} +/* + * PageBalloon() is set on pages that are on the balloon page list + * (see mm/balloon_compaction.c). + */ +#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) +PAGE_MAPCOUNT_OPS(Balloon, BALLOON) extern bool is_free_buddy_page(struct page *page); -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) - -static inline int PageBalloon(struct page *page) -{ - return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE; -} - -static inline void __SetPageBalloon(struct page *page) -{ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); - atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE); -} - -static inline void __ClearPageBalloon(struct page *page) -{ - VM_BUG_ON_PAGE(!PageBalloon(page), page); - atomic_set(&page->_mapcount, -1); -} - __PAGEFLAG(Isolated, isolated, PF_ANY); /* -- cgit v1.2.3 From 452647784b2fccfdeeb976f6f842c6719fb2daac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:24:21 -0700 Subject: mm: memcontrol: cleanup kmem charge functions - Handle memcg_kmem_enabled check out to the caller. This reduces the number of function definitions making the code easier to follow. At the same time it doesn't result in code bloat, because all of these functions are used only in one or two places. - Move __GFP_ACCOUNT check to the caller as well so that one wouldn't have to dive deep into memcg implementation to see which allocations are charged and which are not. - Refresh comments. 
Link: http://lkml.kernel.org/r/52882a28b542c1979fd9a033b4dc8637fc347399.1464079537.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Eric Dumazet Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 103 +++------------------------------------------ 1 file changed, 7 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 56e6069d2452..71aff733a497 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -749,6 +749,13 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } #endif +struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); +void memcg_kmem_put_cache(struct kmem_cache *cachep); +int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg); +int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void memcg_kmem_uncharge(struct page *page, int order); + #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) extern struct static_key_false memcg_kmem_enabled_key; @@ -769,22 +776,6 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } -/* - * In general, we'll do everything in our power to not incur in any overhead - * for non-memcg users for the kmem functions. Not even a function call, if we - * can avoid it. - * - * Therefore, we'll inline all those functions so that in the best case, we'll - * see that kmemcg is off for everybody and proceed quickly. If it is on, - * we'll still do most of the flag checking inline. We check a lot of - * conditions, but because they are pretty simple, they are expected to be - * fast. - */ -int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg); -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); -void __memcg_kmem_uncharge(struct page *page, int order); - /* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function @@ -795,67 +786,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); -void __memcg_kmem_put_cache(struct kmem_cache *cachep); - -static inline bool __memcg_kmem_bypass(void) -{ - if (!memcg_kmem_enabled()) - return true; - if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) - return true; - return false; -} - -/** - * memcg_kmem_charge: charge a kmem page - * @page: page to charge - * @gfp: reclaim mode - * @order: allocation order - * - * Returns 0 on success, an error code on failure. - */ -static __always_inline int memcg_kmem_charge(struct page *page, - gfp_t gfp, int order) -{ - if (__memcg_kmem_bypass()) - return 0; - if (!(gfp & __GFP_ACCOUNT)) - return 0; - return __memcg_kmem_charge(page, gfp, order); -} - -/** - * memcg_kmem_uncharge: uncharge a kmem page - * @page: page to uncharge - * @order: allocation order - */ -static __always_inline void memcg_kmem_uncharge(struct page *page, int order) -{ - if (memcg_kmem_enabled()) - __memcg_kmem_uncharge(page, order); -} - -/** - * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation - * @cachep: the original global kmem cache - * - * All memory allocated from a per-memcg cache is charged to the owner memcg. 
- */ -static __always_inline struct kmem_cache * -memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) -{ - if (__memcg_kmem_bypass()) - return cachep; - return __memcg_kmem_get_cache(cachep, gfp); -} - -static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) -{ - if (memcg_kmem_enabled()) - __memcg_kmem_put_cache(cachep); -} - /** * memcg_kmem_update_page_stat - update kmem page state statistics * @page: the page @@ -878,15 +808,6 @@ static inline bool memcg_kmem_enabled(void) return false; } -static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) -{ - return 0; -} - -static inline void memcg_kmem_uncharge(struct page *page, int order) -{ -} - static inline int memcg_cache_id(struct mem_cgroup *memcg) { return -1; @@ -900,16 +821,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline struct kmem_cache * -memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) -{ - return cachep; -} - -static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) -{ -} - static inline void memcg_kmem_update_page_stat(struct page *page, enum mem_cgroup_stat_index idx, int val) { -- cgit v1.2.3 From 4949148ad433f6f11cf837978b2907092ec99f3a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:24:24 -0700 Subject: mm: charge/uncharge kmemcg from generic page allocator paths Currently, to charge a non-slab allocation to kmemcg one has to use alloc_kmem_pages helper with __GFP_ACCOUNT flag. A page allocated with this helper should finally be freed using free_kmem_pages, otherwise it won't be uncharged. This API suits its current users fine, but it turns out to be impossible to use along with page reference counting, i.e. when an allocation is supposed to be freed with put_page, as it is the case with pipe or unix socket buffers. To overcome this limitation, this patch moves charging/uncharging to generic page allocator paths, i.e. to __alloc_pages_nodemask and free_pages_prepare, and zaps alloc/free_kmem_pages helpers. This way, one can use any of the available page allocation functions to get the allocated page charged to kmemcg - it's enough to pass __GFP_ACCOUNT, just like in case of kmalloc and friends. A charged page will be automatically uncharged on free. To make it possible, we need to mark pages charged to kmemcg somehow. To avoid introducing a new page flag, we make use of page->_mapcount for marking such pages. Since pages charged to kmemcg are not supposed to be mapped to userspace, it should work just fine. There are other (ab)users of page->_mapcount - buddy and balloon pages - but we don't conflict with them. In case kmemcg is compiled out or not used at runtime, this patch introduces no overhead to generic page allocator paths. If kmemcg is used, it will be plus one gfp flags check on alloc and plus one page->_mapcount check on free, which shouldn't hurt performance, because the data accessed are hot. 
Link: http://lkml.kernel.org/r/a9736d856f895bcb465d9f257b54efe32eda6f99.1464079538.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Eric Dumazet Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +--------- include/linux/page-flags.h | 7 +++++++ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 570383a41853..c29e9d347bc6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -78,8 +78,7 @@ struct vm_area_struct; * __GFP_THISNODE forces the allocation to be satisified from the requested * node with no fallbacks or placement policy enforcements. * - * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant - * to kmem allocations). + * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg. */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) @@ -486,10 +485,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) -extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); -extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, - unsigned int order); - extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); @@ -513,9 +508,6 @@ extern void *__alloc_page_frag(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void __free_page_frag(void *addr); -extern void __free_kmem_pages(struct page *page, unsigned int order); -extern void free_kmem_pages(unsigned long addr, unsigned int order); - #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 96084ee74ee8..7c8e82ac2eb7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -641,6 +641,13 @@ PAGE_MAPCOUNT_OPS(Buddy, BUDDY) #define PAGE_BALLOON_MAPCOUNT_VALUE (-256) PAGE_MAPCOUNT_OPS(Balloon, BALLOON) +/* + * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on + * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. + */ +#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) +PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); -- cgit v1.2.3 From fbe84a09da746f781553051bb3dbc63f7b0a5162 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 26 Jul 2016 15:24:39 -0700 Subject: mm,oom: remove unused argument from oom_scan_process_thread(). oom_scan_process_thread() does not use totalpages argument. oom_badness() uses it. 
Link: http://lkml.kernel.org/r/1463796041-7889-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index cbc24a5fe28d..606137b3b778 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -92,7 +92,7 @@ extern void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint); extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task, unsigned long totalpages); + struct task_struct *task); extern bool out_of_memory(struct oom_control *oc); -- cgit v1.2.3 From 8ea1d2a1985a7ae096edf5850a31d844ad1b8e97 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 26 Jul 2016 15:24:42 -0700 Subject: mm, frontswap: convert frontswap_enabled to static key I have noticed that frontswap.h first declares "frontswap_enabled" as extern bool variable, and then overrides it with "#define frontswap_enabled (1)" for CONFIG_FRONTSWAP=Y or (0) when disabled. The bool variable isn't actually instantiated anywhere. This all looks like an unfinished attempt to make frontswap_enabled reflect whether a backend is instantiated. But in the current state, all frontswap hooks call unconditionally into frontswap.c just to check if frontswap_ops is non-NULL. This should at least be checked inline, but we can further eliminate the overhead when CONFIG_FRONTSWAP is enabled and no backend registered, using a static key that is initially disabled, and gets enabled only upon first backend registration. Thus, checks for "frontswap_enabled" are replaced with "frontswap_enabled()" wrapping the static key check. There are two exceptions: - xen's selfballoon_process() was testing frontswap_enabled in code guarded by #ifdef CONFIG_FRONTSWAP, which was effectively always true when reachable. The patch just removes this check. Using frontswap_enabled() does not sound correct here, as this can be true even without xen's own backend being registered. - in SYSCALL_DEFINE2(swapon), change the check to IS_ENABLED(CONFIG_FRONTSWAP) as it seems the bitmap allocation cannot currently be postponed until a backend is registered. This means that frontswap will still have some memory overhead by being configured, but without a backend. After the patch, we can expect that some functions in frontswap.c are called only when frontswap_ops is non-NULL. Change the checks there to VM_BUG_ONs. While at it, convert other BUG_ONs to VM_BUG_ONs as frontswap has been stable for some time. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1463152235-9717-1-git-send-email-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Juergen Gross Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index e65ef959546c..c46d2aa16d81 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -4,6 +4,7 @@ #include #include #include +#include struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ @@ -14,7 +15,6 @@ struct frontswap_ops { struct frontswap_ops *next; /* private pointer to next ops */ }; -extern bool frontswap_enabled; extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); @@ -30,7 +30,12 @@ extern void __frontswap_invalidate_page(unsigned, pgoff_t); extern void __frontswap_invalidate_area(unsigned); #ifdef CONFIG_FRONTSWAP -#define frontswap_enabled (1) +extern struct static_key_false frontswap_enabled_key; + +static inline bool frontswap_enabled(void) +{ + return static_branch_unlikely(&frontswap_enabled_key); +} static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { @@ -50,7 +55,10 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) #else /* all inline routines become no-ops and all externs are ignored */ -#define frontswap_enabled (0) +static inline bool frontswap_enabled(void) +{ + return false; +} static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { @@ -70,37 +78,35 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) static inline int frontswap_store(struct page *page) { - int ret = -1; + if (frontswap_enabled()) + return __frontswap_store(page); - if (frontswap_enabled) - ret = __frontswap_store(page); - return ret; + return -1; } static inline int frontswap_load(struct page *page) { - int ret = -1; + if (frontswap_enabled()) + return __frontswap_load(page); - if (frontswap_enabled) - ret = __frontswap_load(page); - return ret; + return -1; } static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) { - if (frontswap_enabled) + if (frontswap_enabled()) __frontswap_invalidate_page(type, offset); } static inline void frontswap_invalidate_area(unsigned type) { - if (frontswap_enabled) + if (frontswap_enabled()) __frontswap_invalidate_area(type); } static inline void frontswap_init(unsigned type, unsigned long *map) { - if (frontswap_enabled) + if (frontswap_enabled()) __frontswap_init(type, map); } -- cgit v1.2.3 From 91537fee001361b1a4d485f1af65d8efa03d49b5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Jul 2016 15:24:45 -0700 Subject: mm: add NR_ZSMALLOC to vmstat zram is very popular for some of the embedded world (e.g., TV, mobile phones). On those system, zsmalloc's consumed memory size is never trivial (one of example from real product system, total memory: 800M, zsmalloc consumed: 150M), so we have used this out of tree patch to monitor system memory behavior via /proc/vmstat. With zsmalloc in vmstat, it helps in tracking down system behavior due to memory usage. 
[minchan@kernel.org: zsmalloc: follow up zsmalloc vmstat] Link: http://lkml.kernel.org/r/20160607091737.GC23435@bbox [akpm@linux-foundation.org: fix build with CONFIG_ZSMALLOC=m] Link: http://lkml.kernel.org/r/1464919731-13255-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Sangseok Lee Cc: Chanho Min Cc: Chan Gyun Jeong Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3388ccbab7d6..3d7ab30d4940 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -140,6 +140,9 @@ enum zone_stat_item { NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_PAGES_SCANNED, /* pages scanned since last reclaim */ +#if IS_ENABLED(CONFIG_ZSMALLOC) + NR_ZSPAGES, /* allocated in zsmalloc */ +#endif #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ -- cgit v1.2.3 From ba6c19fd113a3965f8cf4c183a813d528008d03e Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 26 Jul 2016 15:24:47 -0700 Subject: include/linux/memblock.h: Clean up code for several trivial details Correct the function parameters alignment, since original code already use both tabs and white spaces together for the incorrect parameters alignment functions. If one line can hold one statement within 80 columns, let it in one line (original code did not consider about the tabs/spaces for 2nd line when a statement is separated into 2 lines). Try to let '' aligned within one macro, since all related lines are short enough. Remove useless statement "idx = 0;", and always assign rgn within the 'for' statement. Link: http://lkml.kernel.org/r/1464904899-1714-1-git-send-email-chengang@emindsoft.com.cn Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 3106ac1c895e..6c14b6179727 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -73,8 +73,8 @@ extern bool movable_node_enabled; if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end, - int nid, ulong flags); + phys_addr_t start, phys_addr_t end, + int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); @@ -110,7 +110,7 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, phys_addr_t *out_end, int *out_nid); void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, - phys_addr_t *out_end); + phys_addr_t *out_end); /** * for_each_mem_range - iterate through memblock areas from type_a and not @@ -148,7 +148,7 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, p_start, p_end, p_nid) \ for (i = (u64)ULLONG_MAX, \ __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ - p_start, p_end, p_nid); \ + p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) @@ -163,8 +163,7 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, * is initialized. 
*/ #define for_each_reserved_mem_region(i, p_start, p_end) \ - for (i = 0UL, \ - __next_reserved_mem_region(&i, p_start, p_end); \ + for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end); \ i != (u64)ULLONG_MAX; \ __next_reserved_mem_region(&i, p_start, p_end)) @@ -403,15 +402,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo } #define for_each_memblock(memblock_type, region) \ - for (region = memblock.memblock_type.regions; \ + for (region = memblock.memblock_type.regions; \ region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ region++) #define for_each_memblock_type(memblock_type, rgn) \ - idx = 0; \ - rgn = &memblock_type->regions[idx]; \ - for (idx = 0; idx < memblock_type->cnt; \ - idx++,rgn = &memblock_type->regions[idx]) + for (idx = 0, rgn = &memblock_type->regions[0]; \ + idx < memblock_type->cnt; \ + idx++, rgn = &memblock_type->regions[idx]) #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); -- cgit v1.2.3 From 8a5c743e308dd2b90ad10d1faaa7a1b09173a132 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2016 15:24:53 -0700 Subject: mm, memcg: use consistent gfp flags during readahead Vladimir has noticed that we might declare memcg oom even during readahead because read_pages only uses GFP_KERNEL (with mapping_gfp restriction) while __do_page_cache_readahead uses page_cache_alloc_readahead which adds __GFP_NORETRY to prevent from OOMs. This gfp mask discrepancy is really unfortunate and easily fixable. Drop page_cache_alloc_readahead() which only has one user and outsource the gfp_mask logic into readahead_gfp_mask and propagate this mask from __do_page_cache_readahead down to read_pages. This alone would have only very limited impact as most filesystems are implementing ->readpages and the common implementation mpage_readpages does GFP_KERNEL (with mapping_gfp restriction) again. We can tell it to use readahead_gfp_mask instead as this function is called only during readahead as well. The same applies to read_cache_pages. ext4 has its own ext4_mpage_readpages but the path which has pages != NULL can use the same gfp mask. Btrfs, cifs, f2fs and orangefs are doing a very similar pattern to mpage_readpages so the same can be applied to them as well. 
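As a sketch of the intended usage (illustrative; only the readahead_gfp_mask() helper in the hunk below is introduced by this patch, the surrounding names are assumed context), a ->readpages() style path derives its allocation mask from the mapping once and reuses it:

	/* Illustrative sketch: allocate readahead pages with a consistent mask. */
	gfp_t gfp = readahead_gfp_mask(mapping);
	struct page *page = __page_cache_alloc(gfp);

	if (page && !add_to_page_cache_lru(page, mapping, index, gfp))
		mapping->a_ops->readpage(file, page);
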
[akpm@linux-foundation.org: coding-style fixes] [mhocko@suse.com: restrict gfp mask in mpage_alloc] Link: http://lkml.kernel.org/r/20160610074223.GC32285@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1465301556-26431-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Vladimir Davydov Cc: Chris Mason Cc: Steve French Cc: Theodore Ts'o Cc: Jan Kara Cc: Mike Marshall Cc: Jaegeuk Kim Cc: Changman Lee Cc: Chao Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 97354102794d..81363b834900 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -209,10 +209,10 @@ static inline struct page *page_cache_alloc_cold(struct address_space *x) return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); } -static inline struct page *page_cache_alloc_readahead(struct address_space *x) +static inline gfp_t readahead_gfp_mask(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x) | - __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN); + return mapping_gfp_mask(x) | + __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(void *, struct page *); -- cgit v1.2.3 From dcddffd41d3f1d3bdcc1dce3f1cd142779b6d4c1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:25:18 -0700 Subject: mm: do not pass mm_struct into handle_mm_fault We always have vma->vm_mm around. Link: http://lkml.kernel.org/r/1466021202-61880-8-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c9a394b2979..646bc36b4d1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1215,15 +1215,14 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU -extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags); +extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); #else -static inline int handle_mm_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +static inline int handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { /* should never happen if there's no MMU */ BUG(); -- cgit v1.2.3 From bae473a423f65e480db83c85b5e92254f6dfcb28 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:25:20 -0700 Subject: mm: introduce fault_env The idea borrowed from Peter's patch from patchset on speculative page faults[1]: Instead of passing around the endless list of function arguments, replace the lot with a single structure so we can change context without endless function signature changes. The changes are mostly mechanical with exception of faultaround code: filemap_map_pages() got reworked a bit. This patch is preparation for the next one. 
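An illustrative sketch (not a hunk from this patch) of what the consolidated context looks like at the top of the fault path, using only the fields added below; handle_pte_fault() here stands for the mm/memory.c-internal pte-level handler:

	/*
	 * Hypothetical sketch: build the fault context once and hand it
	 * down, instead of threading vma/address/flags/pmd/pte through
	 * every function signature.
	 */
	struct fault_env fe = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
	};

	/* ... locate or allocate the pmd, stash it in fe.pmd ... */
	return handle_pte_fault(&fe);
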
[1] http://lkml.kernel.org/r/20141020222841.302891540@infradead.org Link: http://lkml.kernel.org/r/1466021202-61880-9-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 20 +++++--------------- include/linux/mm.h | 34 ++++++++++++++++++++++++++-------- include/linux/userfaultfd_k.h | 8 ++------ 3 files changed, 33 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f0a7a0320300..9bed9249156f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -1,20 +1,12 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - unsigned int flags); +extern int do_huge_pmd_anonymous_page(struct fault_env *fe); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd, int dirty); -extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd); +extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); +extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page) return 1; } -extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp); +extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, return NULL; } -static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) +static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) { return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 646bc36b4d1b..8bd74558c0e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -309,10 +309,27 @@ struct vm_fault { * VM_FAULT_DAX_LOCKED and fill in * entry here. */ - /* for ->map_pages() only */ - pgoff_t max_pgoff; /* map pages for offset from pgoff till - * max_pgoff inclusive */ - pte_t *pte; /* pte entry associated with ->pgoff */ +}; + +/* + * Page fault context: passes though page fault handler instead of endless list + * of function arguments. + */ +struct fault_env { + struct vm_area_struct *vma; /* Target VMA */ + unsigned long address; /* Faulting virtual address */ + unsigned int flags; /* FAULT_FLAG_xxx flags */ + pmd_t *pmd; /* Pointer to pmd entry matching + * the 'address' + */ + pte_t *pte; /* Pointer to pte entry matching + * the 'address'. NULL if the page + * table hasn't been allocated. + */ + spinlock_t *ptl; /* Page table lock. + * Protects pte page table if 'pte' + * is not NULL, otherwise pmd. 
+ */ }; /* @@ -327,7 +344,8 @@ struct vm_operations_struct { int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); int (*pmd_fault)(struct vm_area_struct *, unsigned long address, pmd_t *, unsigned int flags); - void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); + void (*map_pages)(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ @@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon); +void do_set_pte(struct fault_env *fe, struct page *page); #endif /* @@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); -extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); +extern void filemap_map_pages(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); /* mm/page-writeback.c */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 587480ad41b7..dd66a952e8cd 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -27,8 +27,7 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) -extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long reason); +extern int handle_userfault(struct fault_env *fe, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len); @@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -static inline int handle_userfault(struct vm_area_struct *vma, - unsigned long address, - unsigned int flags, - unsigned long reason) +static inline int handle_userfault(struct fault_env *fe, unsigned long reason) { return VM_FAULT_SIGBUS; } -- cgit v1.2.3 From 7267ec008b5cd8b3579e188b1ff238815643e372 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:25:23 -0700 Subject: mm: postpone page table allocation until we have page to map The idea (and most of code) is borrowed again: from Hugh's patchset on huge tmpfs[1]. Instead of allocation pte page table upfront, we postpone this until we have page to map in hands. This approach opens possibility to map the page as huge if filesystem supports this. Comparing to Hugh's patch I've pushed page table allocation a bit further: into do_set_pte(). This way we can postpone allocation even in faultaround case without moving do_fault_around() after __do_fault(). do_set_pte() got renamed to alloc_set_pte() as it can allocate page table if required. [1] http://lkml.kernel.org/r/alpine.LSU.2.11.1502202015090.14414@eggly.anvils Link: http://lkml.kernel.org/r/1466021202-61880-10-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bd74558c0e4..192c1bbe5fcd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -330,6 +330,13 @@ struct fault_env { * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ + pgtable_t prealloc_pte; /* Pre-allocated pte page table. + * vm_ops->map_pages() calls + * alloc_set_pte() from atomic context. + * do_fault_around() pre-allocates + * page table to avoid allocation from + * atomic context. + */ }; /* @@ -618,7 +625,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -void do_set_pte(struct fault_env *fe, struct page *page); +int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, + struct page *page); #endif /* -- cgit v1.2.3 From dd78fedde4b99b322f2dc849d467d365a82e23ca Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:25:26 -0700 Subject: rmap: support file thp Naive approach: on mapping/unmapping the page as compound we update ->_mapcount on each 4k page. That's not efficient, but it's not obvious how we can optimize this. We can look into optimization later. PG_double_map optimization doesn't work for file pages since lifecycle of file pages is different comparing to anon pages: file page can be mapped again at any time. Link: http://lkml.kernel.org/r/1466021202-61880-11-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 2b0fad83683f..b46bb5620a76 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -165,7 +165,7 @@ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); -void page_add_file_rmap(struct page *); +void page_add_file_rmap(struct page *, bool); void page_remove_rmap(struct page *, bool); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, -- cgit v1.2.3 From 1010245964415bb7403463115bab2cd26244b445 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:25:29 -0700 Subject: mm: introduce do_set_pmd() With postponed page table allocation we have chance to setup huge pages. do_set_pte() calls do_set_pmd() if following criteria met: - page is compound; - pmd entry in pmd_none(); - vma has suitable size and alignment; Link: http://lkml.kernel.org/r/1466021202-61880-12-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9bed9249156f..254aac4c3963 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -143,6 +143,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct page *get_huge_zero_page(void); void put_huge_zero_page(void); +#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) -- cgit v1.2.3 From 95ecedcd6abbb05d8177331e2fa697888dcd634b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:25:31 -0700 Subject: thp, vmstats: add counters for huge file pages THP_FILE_ALLOC: how many times huge page was allocated and put page cache. THP_FILE_MAPPED: how many times file huge page was mapped. Link: http://lkml.kernel.org/r/1466021202-61880-13-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ec084321fe09..42604173f122 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_FAULT_FALLBACK, THP_COLLAPSE_ALLOC, THP_COLLAPSE_ALLOC_FAILED, + THP_FILE_ALLOC, + THP_FILE_MAPPED, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, @@ -100,4 +102,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_VM_EVENT_ITEMS }; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +#define THP_FILE_ALLOC ({ BUILD_BUG(); 0; }) +#define THP_FILE_MAPPED ({ BUILD_BUG(); 0; }) +#endif + #endif /* VM_EVENT_ITEM_H_INCLUDED */ -- cgit v1.2.3 From 9a73f61bdb8acdc01bbaf72a3fe0a8854f2463ad Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:25:53 -0700 Subject: thp, mlock: do not mlock PTE-mapped file huge pages As with anon THP, we only mlock file huge pages if we can prove that the page is not mapped with PTE. This way we can avoid mlock leak into non-mlocked vma on split. We rely on PageDoubleMap() under lock_page() to check if the the page may be PTE mapped. PG_double_map is set by page_add_file_rmap() when the page mapped with PTEs. Link: http://lkml.kernel.org/r/1466021202-61880-21-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7c8e82ac2eb7..8cf09639185a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -581,6 +581,17 @@ static inline int PageDoubleMap(struct page *page) return PageHead(page) && test_bit(PG_double_map, &page[1].flags); } +static inline void SetPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + set_bit(PG_double_map, &page[1].flags); +} + +static inline void ClearPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + clear_bit(PG_double_map, &page[1].flags); +} static inline int TestSetPageDoubleMap(struct page *page) { VM_BUG_ON_PAGE(!PageHead(page), page); @@ -598,7 +609,7 @@ TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) -TESTPAGEFLAG_FALSE(DoubleMap) +PAGEFLAG_FALSE(DoubleMap) TESTSETFLAG_FALSE(DoubleMap) TESTCLEARFLAG_FALSE(DoubleMap) #endif -- cgit v1.2.3 From e2f0a0db95979a4aa951d883248da9d361507abf Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:25:59 -0700 Subject: page-flags: relax policy for PG_mappedtodisk and PG_reclaim These flags are in use for file THP. Link: http://lkml.kernel.org/r/1466021202-61880-23-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8cf09639185a..74e4dda91238 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -295,11 +295,11 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) */ TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) -PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) +PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ -PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) - TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) + TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) -- cgit v1.2.3 From c78c66d1ddfdbd2353f3fcfeba0268524537b096 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:26:02 -0700 Subject: radix-tree: implement radix_tree_maybe_preload_order() The new helper is similar to radix_tree_maybe_preload(), but tries to preload number of nodes required to insert (1 << order) continuous naturally-aligned elements. This is required to push huge pages into pagecache. Link: http://lkml.kernel.org/r/1466021202-61880-24-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index eca6f626c16e..cbfee507c839 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -291,6 +291,7 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, unsigned long first_index, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); +int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); -- cgit v1.2.3 From 65c453778aea374a46597f4d9826274d1eaf7338 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:26:10 -0700 Subject: mm, rmap: account shmem thp pages Let's add ShmemHugePages and ShmemPmdMapped fields into meminfo and smaps. It indicates how many times we allocate and map shmem THP. NR_ANON_TRANSPARENT_HUGEPAGES is renamed to NR_ANON_THPS. Link: http://lkml.kernel.org/r/1466021202-61880-27-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3d7ab30d4940..19425e988bdc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -154,7 +154,9 @@ enum zone_stat_item { WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, - NR_ANON_TRANSPARENT_HUGEPAGES, + NR_ANON_THPS, + NR_SHMEM_THPS, + NR_SHMEM_PMDMAPPED, NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; -- cgit v1.2.3 From 5a6e75f8110c97e2a5488894d4e922187e6cb343 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:26:13 -0700 Subject: shmem: prepare huge= mount option and sysfs knob This patch adds new mount option "huge=". It can have following values: - "always": Attempt to allocate huge pages every time we need a new page; - "never": Do not allocate huge pages; - "within_size": Only allocate huge page if it will be fully within i_size. Also respect fadvise()/madvise() hints; - "advise: Only allocate huge pages if requested with fadvise()/madvise(); Default is "never" for now. "mount -o remount,huge= /mountpoint" works fine after mount: remounting huge=never will not attempt to break up huge pages at all, just stop more from being allocated. No new config option: put this under CONFIG_TRANSPARENT_HUGEPAGE, which is the appropriate option to protect those who don't want the new bloat, and with which we shall share some pmd code. Prohibit the option when !CONFIG_TRANSPARENT_HUGEPAGE, just as mpol is invalid without CONFIG_NUMA (was hidden in mpol_parse_str(): make it explicit). Allow enabling THP only if the machine has_transparent_hugepage(). But what about Shmem with no user-visible mount? SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. Though unlikely to suit all usages, provide sysfs knob /sys/kernel/mm/transparent_hugepage/shmem_enabled to experiment with huge on those. And allow shmem_enabled two further values: - "deny": For use in emergencies, to force the huge option off from all mounts; - "force": Force the huge option on for all - very useful for testing; Based on patch by Hugh Dickins. 
Link: http://lkml.kernel.org/r/1466021202-61880-28-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ include/linux/shmem_fs.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 254aac4c3963..2f757132662a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -41,6 +41,8 @@ enum transparent_hugepage_flag { #endif }; +extern struct kobj_attribute shmem_enabled_attr; + #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1< Date: Tue, 26 Jul 2016 15:26:15 -0700 Subject: shmem: get_unmapped_area align huge page Provide a shmem_get_unmapped_area method in file_operations, called at mmap time to decide the mapping address. It could be conditional on CONFIG_TRANSPARENT_HUGEPAGE, but save #ifdefs in other places by making it unconditional. shmem_get_unmapped_area() first calls the usual mm->get_unmapped_area (which we treat as a black box, highly dependent on architecture and config and executable layout). Lots of conditions, and in most cases it just goes with the address that chose; but when our huge stars are rightly aligned, yet that did not provide a suitable address, go back to ask for a larger arena, within which to align the mapping suitably. There have to be some direct calls to shmem_get_unmapped_area(), not via the file_operations: because of the way shmem_zero_setup() is called to create a shmem object late in the mmap sequence, when MAP_SHARED is requested with MAP_ANONYMOUS or /dev/zero. Though this only matters when /proc/sys/vm/shmem_huge has been set. Link: http://lkml.kernel.org/r/1466021202-61880-29-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Hugh Dickins Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 466f18c73a49..ff2de4bab61f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -50,6 +50,8 @@ extern struct file *shmem_file_setup(const char *name, extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); +extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern bool shmem_mapping(struct address_space *mapping); extern void shmem_unlock_mapping(struct address_space *mapping); -- cgit v1.2.3 From 800d8c63b2e989c2e349632d1648119bf5862f01 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:26:18 -0700 Subject: shmem: add huge pages support Here's basic implementation of huge pages support for shmem/tmpfs. It's all pretty streight-forward: - shmem_getpage() allcoates huge page if it can and try to inserd into radix tree with shmem_add_to_page_cache(); - shmem_add_to_page_cache() puts the page onto radix-tree if there's space for it; - shmem_undo_range() removes huge pages, if it fully within range. Partial truncate of huge pages zero out this part of THP. This have visible effect on fallocate(FALLOC_FL_PUNCH_HOLE) behaviour. 
As we don't really create hole in this case, lseek(SEEK_HOLE) may have inconsistent results depending what pages happened to be allocated. - no need to change shmem_fault: core-mm will map an compound page as huge if VMA is suitable; Link: http://lkml.kernel.org/r/1466021202-61880-30-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ include/linux/shmem_fs.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2f757132662a..7b7406e9fedf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -156,6 +156,8 @@ void put_huge_zero_page(void); #define transparent_hugepage_enabled(__vma) 0 +static inline void prep_transhuge_page(struct page *page) {} + #define transparent_hugepage_flags 0UL static inline int split_huge_page_to_list(struct page *page, struct list_head *list) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff2de4bab61f..94eaaa2c6ad9 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -71,6 +71,9 @@ static inline struct page *shmem_read_mapping_page( mapping_gfp_mask(mapping)); } +extern bool shmem_charge(struct inode *inode, long pages); +extern void shmem_uncharge(struct inode *inode, long pages); + #ifdef CONFIG_TMPFS extern int shmem_add_seals(struct file *file, unsigned int seals); -- cgit v1.2.3 From b46e756f5e47031c67658ff036e5ffe27062fa43 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:26:24 -0700 Subject: thp: extract khugepaged from mm/huge_memory.c khugepaged implementation grew to the point when it deserve separate file in source. Let's move it to mm/khugepaged.c. Link: http://lkml.kernel.org/r/1466021202-61880-32-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 10 ++++++++++ include/linux/khugepaged.h | 5 +++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7b7406e9fedf..92ce91c03cd0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -41,6 +41,16 @@ enum transparent_hugepage_flag { #endif }; +struct kobject; +struct kobj_attribute; + +extern ssize_t single_hugepage_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag); +extern ssize_t single_hugepage_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index eeb307985715..1e032a1ddb3e 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -4,6 +4,11 @@ #include /* MMF_VM_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern struct attribute_group khugepaged_attr_group; + +extern int khugepaged_init(void); +extern void khugepaged_destroy(void); +extern int start_stop_khugepaged(void); extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, -- cgit v1.2.3 From f3f0e1d2150b2b99da2cbdfaad000089efe9bf30 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Tue, 26 Jul 2016 15:26:32 -0700 Subject: khugepaged: add support of collapse for tmpfs/shmem pages This patch extends khugepaged to support collapse of tmpfs/shmem pages. We share fair amount of infrastructure with anon-THP collapse. Few design points: - First we are looking for VMA which can be suitable for mapping huge page; - If the VMA maps shmem file, the rest scan/collapse operations operates on page cache, not on page tables as in anon VMA case. - khugepaged_scan_shmem() finds a range which is suitable for huge page. The scan is lockless and shouldn't disturb system too much. - once the candidate for collapse is found, collapse_shmem() attempts to create a huge page: + scan over radix tree, making the range point to new huge page; + new huge page is not-uptodate, locked and freezed (refcount is 0), so nobody can touch them until we say so. + we swap in pages during the scan. khugepaged_scan_shmem() filters out ranges with more than khugepaged_max_ptes_swap swapped out pages. It's HPAGE_PMD_NR/8 by default. + old pages are isolated, unmapped and put to local list in case to be restored back if collapse failed. - if collapse succeed, we retract pte page tables from VMAs where huge pages mapping is possible. The huge page will be mapped as PMD on next minor fault into the range. Link: http://lkml.kernel.org/r/1466021202-61880-35-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 94eaaa2c6ad9..0890f700a546 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -54,6 +54,7 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern bool shmem_mapping(struct address_space *mapping); +extern bool shmem_huge_enabled(struct vm_area_struct *vma); extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); @@ -64,6 +65,19 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); +/* Flag allocation requirements to shmem_getpage */ +enum sgp_type { + SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ + SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ + SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ + SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ +}; + +extern int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp); + static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { @@ -71,6 +85,15 @@ static inline struct page *shmem_read_mapping_page( mapping_gfp_mask(mapping)); } +static inline bool shmem_file(struct file *file) +{ + if (!IS_ENABLED(CONFIG_SHMEM)) + return false; + if (!file || !file->f_mapping) + return false; + return shmem_mapping(file->f_mapping); +} + extern bool shmem_charge(struct inode *inode, long pages); extern void 
shmem_uncharge(struct inode *inode, long pages); -- cgit v1.2.3 From e496cf3d782135c1cca0d154d4b924517ff58de0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:26:35 -0700 Subject: thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE For file mappings, we don't deposit page tables on THP allocation because it's not strictly required to implement split_huge_pmd(): we can just clear pmd and let following page faults to reconstruct the page table. But Power makes use of deposited page table to address MMU quirk. Let's hide THP page cache, including huge tmpfs, under separate config option, so it can be forbidden on Power. We can revert the patch later once solution for Power found. Link: http://lkml.kernel.org/r/1466021202-61880-36-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0890f700a546..54fa28dfbd89 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -54,7 +54,6 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern bool shmem_mapping(struct address_space *mapping); -extern bool shmem_huge_enabled(struct vm_area_struct *vma); extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); @@ -112,4 +111,13 @@ static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) #endif +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +extern bool shmem_huge_enabled(struct vm_area_struct *vma); +#else +static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + return false; +} +#endif + #endif -- cgit v1.2.3 From 779750d20b93bb2e0c75dfe924f31b02f6a78bfa Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jul 2016 15:26:38 -0700 Subject: shmem: split huge pages beyond i_size under memory pressure Even if user asked to allocate huge pages always (huge=always), we should be able to free up some memory by splitting pages which are partly byound i_size if memory presure comes or once we hit limit on filesystem size (-o size=). In order to do this we maintain per-superblock list of inodes, which potentially have huge pages on the border of file size. Per-fs shrinker can reclaim memory by splitting such pages. If we hit -ENOSPC during shmem_getpage_gfp(), we try to split a page to free up space on the filesystem and retry allocation if it succeed. Link: http://lkml.kernel.org/r/1466021202-61880-37-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 54fa28dfbd89..ff078e7043b6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -16,8 +16,9 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ - struct shared_policy policy; /* NUMA memory alloc policy */ + struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ + struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ struct inode vfs_inode; }; @@ -33,6 +34,9 @@ struct shmem_sb_info { kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ struct mempolicy *mpol; /* default memory policy for mappings */ + spinlock_t shrinklist_lock; /* Protects shrinklist */ + struct list_head shrinklist; /* List of shinkable inodes */ + unsigned long shrinklist_len; /* Length of shrinklist */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) -- cgit v1.2.3 From dd4123f324bbaec7618b677b7bce2b11aee9594e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Jul 2016 15:26:50 -0700 Subject: mm: fix build warnings in Randy reported below build error. > In file included from ../include/linux/balloon_compaction.h:48:0, > from ../mm/balloon_compaction.c:11: > ../include/linux/compaction.h:237:51: warning: 'struct node' declared inside parameter list [enabled by default] > static inline int compaction_register_node(struct node *node) > ../include/linux/compaction.h:237:51: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default] > ../include/linux/compaction.h:242:54: warning: 'struct node' declared inside parameter list [enabled by default] > static inline void compaction_unregister_node(struct node *node) > It was caused by non-lru page migration which needs compaction.h but compaction.h doesn't include any header to be standalone. I think proper header for non-lru page migration is migrate.h rather than compaction.h because migrate.h has already headers needed to work non-lru page migration indirectly like isolate_mode_t, migrate_mode MIGRATEPAGE_SUCCESS. 
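For illustration (a sketch of an assumed non-lru movable page user, not code from this patch), the declarations are now picked up from migrate.h:

	#include <linux/migrate.h>	/* these hooks now live here */

	/* Illustrative only: 'page' and 'mapping' are assumed context. */
	__SetPageMovable(page, mapping);
	if (PageMovable(page))
		__ClearPageMovable(page);
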
[akpm@linux-foundation.org: revert mm-balloon-use-general-non-lru-movable-page-feature-fix.patch temp fix] Link: http://lkml.kernel.org/r/20160610003304.GE29779@bbox Signed-off-by: Minchan Kim Reported-by: Randy Dunlap Cc: Konstantin Khlebnikov Cc: Vlastimil Babka Cc: Gioh Kim Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 3 +-- include/linux/compaction.h | 16 ---------------- include/linux/migrate.h | 15 +++++++++++++++ 3 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 504bd724e6ab..79542b2698ec 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -45,8 +45,7 @@ #define _LINUX_BALLOON_COMPACTION_H #include #include -#include -#include +#include #include #include #include diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c6b47c861cea..1a02dab16646 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -54,9 +54,6 @@ enum compact_result { struct alloc_context; /* in mm/internal.h */ #ifdef CONFIG_COMPACTION -extern int PageMovable(struct page *page); -extern void __SetPageMovable(struct page *page, struct address_space *mapping); -extern void __ClearPageMovable(struct page *page); extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); @@ -154,19 +151,6 @@ extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); #else -static inline int PageMovable(struct page *page) -{ - return 0; -} -static inline void __SetPageMovable(struct page *page, - struct address_space *mapping) -{ -} - -static inline void __ClearPageMovable(struct page *page) -{ -} - static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 404fbfefeb33..ae8d475a9385 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -71,6 +71,21 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_COMPACTION +extern int PageMovable(struct page *page); +extern void __SetPageMovable(struct page *page, struct address_space *mapping); +extern void __ClearPageMovable(struct page *page); +#else +static inline int PageMovable(struct page *page) { return 0; }; +static inline void __SetPageMovable(struct page *page, + struct address_space *mapping) +{ +} +static inline void __ClearPageMovable(struct page *page) +{ +} +#endif + #ifdef CONFIG_NUMA_BALANCING extern bool pmd_trans_migrating(pmd_t pmd); extern int migrate_misplaced_page(struct page *page, -- cgit v1.2.3