From c3cddc4c296c3a1498df7181015cbcea178a0546 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 24 Jun 2015 16:54:45 -0700
Subject: fsnotify: remove obsolete documentation

should_send_event is no longer part of struct fsnotify_ops, so remove it.

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fsnotify_backend.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0f313f93c586..65a517dd32f7 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,8 +84,6 @@ struct fsnotify_fname;
  * Each group much define these ops.  The fsnotify infrastructure will call
  * these operations for each relevant group.
  *
- * should_send_event - given a group, inode, and mask this function determines
- *		if the group is interested in this event.
  * handle_event - main call for a group to handle an fs event
  * free_group_priv - called when a group refcnt hits 0 to clean up the private union
  * freeing_mark - called when a mark is being destroyed for some reason.  The group
-- 
cgit v1.2.3


From 5286d20c4eb7b0c29217f8756652609df74f5489 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Wed, 24 Jun 2015 16:54:51 -0700
Subject: configfs: unexport/make static config_item_init()

config_item_init() is only used in item.c

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/configfs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 34025df61829..c9e5c57e4edf 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item)
 	return item->ci_name;
 }
 
-extern void config_item_init(struct config_item *);
 extern void config_item_init_type_name(struct config_item *item,
 				       const char *name,
 				       struct config_item_type *type);
-- 
cgit v1.2.3


From b5242e98c1cb834feb1e84026f09a4796b49eb4d Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@ezchip.com>
Date: Wed, 24 Jun 2015 16:55:42 -0700
Subject: smpboot: allow excluding cpus from the smpboot threads

This patch series allows the watchdog to run by default only on the
housekeeping cores when nohz_full is in effect; this seems to be a good
compromise short of turning it off completely (since the nohz_full cores
can't tolerate a watchdog).

To provide customizability, we add /proc/sys/kernel/watchdog_cpumask so
that the set of cores running the watchdog can be tuned to different
values after bootup.

To implement this customizability, we add a new
smpboot_update_cpumask_percpu_thread() API to the smpboot_thread
subsystem that lets us park or unpark "unwanted" threads.

And now that threads can be parked for long periods of time, we tweak the
/proc/<pid>/stat and /proc/<pid>/status code so parked threads aren't
reported as running, which is otherwise confusing.

This patch (of 3):

This change allows some cores to be excluded from running the
smp_hotplug_thread tasks.  The following commit to update
kernel/watchdog.c to use this functionality is the motivating example, and
more information on the motivation is provided there.

A new smp_hotplug_thread field is introduced, "cpumask", which is cpumask
field managed by the smpboot subsystem that indicates whether or not the
given smp_hotplug_thread should run on that core; the cpumask is checked
when deciding whether to unpark the thread.

To limit the cpumask to less than cpu_possible, you must call
smpboot_update_cpumask_percpu_thread() after registering.

Signed-off-by: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/smpboot.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index d600afb21926..da3c593f9845 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -27,6 +27,8 @@ struct smpboot_thread_data;
  * @pre_unpark:		Optional unpark function, called before the thread is
  *			unparked (cpu online). This is not guaranteed to be
  *			called on the target cpu of the thread. Careful!
+ * @cpumask:		Internal state.  To update which threads are unparked,
+ *			call smpboot_update_cpumask_percpu_thread().
  * @selfparking:	Thread is not parked by the park function.
  * @thread_comm:	The base name of the thread
  */
@@ -41,11 +43,14 @@ struct smp_hotplug_thread {
 	void				(*park)(unsigned int cpu);
 	void				(*unpark)(unsigned int cpu);
 	void				(*pre_unpark)(unsigned int cpu);
+	cpumask_var_t			cpumask;
 	bool				selfparking;
 	const char			*thread_comm;
 };
 
 int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+					 const struct cpumask *);
 
 #endif
-- 
cgit v1.2.3


From fe4ba3c34352b7e8068b7f18eb233444aed17011 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@ezchip.com>
Date: Wed, 24 Jun 2015 16:55:45 -0700
Subject: watchdog: add watchdog_cpumask sysctl to assist nohz

Change the default behavior of watchdog so it only runs on the
housekeeping cores when nohz_full is enabled at build and boot time.
Allow modifying the set of cores the watchdog is currently running on
with a new kernel.watchdog_cpumask sysctl.

In the current system, the watchdog subsystem runs a periodic timer that
schedules the watchdog kthread to run.  However, nohz_full cores are
designed to allow userspace application code running on those cores to
have 100% access to the CPU.  So the watchdog system prevents the
nohz_full application code from being able to run the way it wants to,
thus the motivation to suppress the watchdog on nohz_full cores, which
this patchset provides by default.

However, if we disable the watchdog globally, then the housekeeping
cores can't benefit from the watchdog functionality.  So we allow
disabling it only on some cores.  See Documentation/lockup-watchdogs.txt
for more information.

[jhubbard@nvidia.com: fix a watchdog crash in some configurations]
Signed-off-by: Chris Metcalf <cmetcalf@ezchip.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nmi.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 3d46fb4708e0..f94da0e65dea 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled;
 extern int soft_watchdog_enabled;
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
+extern unsigned long *watchdog_cpumask_bits;
 extern int sysctl_softlockup_all_cpu_backtrace;
 struct ctl_table;
 extern int proc_watchdog(struct ctl_table *, int ,
@@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int ,
 			      void __user *, size_t *, loff_t *);
 extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
+extern int proc_watchdog_cpumask(struct ctl_table *, int,
+				 void __user *, size_t *, loff_t *);
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
-- 
cgit v1.2.3


From 4066c33d0308f87e9a3b0c7fafb9141c0bfbfa77 Mon Sep 17 00:00:00 2001
From: Gavin Guo <gavin.guo@canonical.com>
Date: Wed, 24 Jun 2015 16:55:54 -0700
Subject: mm/slab_common: support the slub_debug boot option on specific object
 size

The slub_debug=PU,kmalloc-xx cannot work because in the
create_kmalloc_caches() the s->name is created after the
create_kmalloc_cache() is called.  The name is NULL in the
create_kmalloc_cache() so the kmem_cache_flags() would not set the
slub_debug flags to the s->flags.  The fix here set up a kmalloc_names
string array for the initialization purpose and delete the dynamic name
creation of kmalloc_caches.

[akpm@linux-foundation.org: s/kmalloc_names/kmalloc_info/, tweak comment text]
Signed-off-by: Gavin Guo <gavin.guo@canonical.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index ffd24c830151..96f0ea506b5c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -153,8 +153,30 @@ size_t ksize(const void *);
 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
 #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
+/*
+ * The KMALLOC_LOOP_LOW is the definition for the for loop index start number
+ * to create the kmalloc_caches object in create_kmalloc_caches(). The first
+ * and the second are 96 and 192. You can see that in the kmalloc_index(), if
+ * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64,
+ * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't
+ * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW.
+ */
+#if KMALLOC_MIN_SIZE <= 32
+#define KMALLOC_LOOP_LOW 1
+#elif KMALLOC_MIN_SIZE <= 64
+#define KMALLOC_LOOP_LOW 2
+#else
+#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW
+#endif
+
 #else
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
+/*
+ * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used.
+ * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be
+ * initialized.
+ */
+#define KMALLOC_LOOP_LOW 1
 #endif
 
 /*
-- 
cgit v1.2.3


From 1ed58b6051b67e5cfe9e465fb60bf7d5f55e0a64 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 24 Jun 2015 16:55:59 -0700
Subject: linux/slab.h: fix three off-by-one typos in comment

The first is a keyboard-off-by-one, the other two the ordinary mathy kind.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 96f0ea506b5c..9de2fdc8b5e4 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -262,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
  * belongs to.
  * 0 = zero alloc
  * 1 =  65 .. 96 bytes
- * 2 = 120 .. 192 bytes
- * n = 2^(n-1) .. 2^n -1
+ * 2 = 129 .. 192 bytes
+ * n = 2^(n-1)+1 .. 2^n
  */
 static __always_inline int kmalloc_index(size_t size)
 {
-- 
cgit v1.2.3


From 2ae416b142b625c58c9ccb039aa3ef48ad0e9bae Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Wed, 24 Jun 2015 16:56:16 -0700
Subject: mm: new mm hook framework

CRIU is recreating the process memory layout by remapping the checkpointee
memory area on top of the current process (criu).  This includes remapping
the vDSO to the place it has at checkpoint time.

However some architectures like powerpc are keeping a reference to the
vDSO base address to build the signal return stack frame by calling the
vDSO sigreturn service.  So once the vDSO has been moved, this reference
is no more valid and the signal frame built later are not usable.

This patch serie is introducing a new mm hook framework, and a new
arch_remap hook which is called when mremap is done and the mm lock still
hold.  The next patch is adding the vDSO remap and unmap tracking to the
powerpc architecture.

This patch (of 3):

This patch introduces a new set of header file to manage mm hooks:
- per architecture empty header file (arch/x/include/asm/mm-arch-hooks.h)
- a generic header (include/linux/mm-arch-hooks.h)

The architecture which need to overwrite a hook as to redefine it in its
header file, while architecture which doesn't need have nothing to do.

The default hooks are defined in the generic header and are used in the
case the architecture is not defining it.

In a next step, mm hooks defined in include/asm-generic/mm_hooks.h should
be moved here.

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm-arch-hooks.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 include/linux/mm-arch-hooks.h

(limited to 'include/linux')

diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
new file mode 100644
index 000000000000..63005e367abd
--- /dev/null
+++ b/include/linux/mm-arch-hooks.h
@@ -0,0 +1,16 @@
+/*
+ * Generic mm no-op hooks.
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _LINUX_MM_ARCH_HOOKS_H
+#define _LINUX_MM_ARCH_HOOKS_H
+
+#include <asm/mm-arch-hooks.h>
+
+#endif /* _LINUX_MM_ARCH_HOOKS_H */
-- 
cgit v1.2.3


From 4abad2ca4a4dbdd4a218c12451231ab628f2e60c Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Wed, 24 Jun 2015 16:56:19 -0700
Subject: mm: new arch_remap() hook

Some architectures would like to be triggered when a memory area is moved
through the mremap system call.

This patch introduces a new arch_remap() mm hook which is placed in the
path of mremap, and is called before the old area is unmapped (and the
arch_unmap() hook is called).

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm-arch-hooks.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
index 63005e367abd..4efc3f56e6df 100644
--- a/include/linux/mm-arch-hooks.h
+++ b/include/linux/mm-arch-hooks.h
@@ -13,4 +13,13 @@
 
 #include <asm/mm-arch-hooks.h>
 
+#ifndef arch_remap
+static inline void arch_remap(struct mm_struct *mm,
+			      unsigned long old_start, unsigned long old_end,
+			      unsigned long new_start, unsigned long new_end)
+{
+}
+#define arch_remap arch_remap
+#endif
+
 #endif /* _LINUX_MM_ARCH_HOOKS_H */
-- 
cgit v1.2.3


From a9919c79359df9a706ff9e14fc0a93cd15791c9b Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 24 Jun 2015 16:56:28 -0700
Subject: mm: only define hashdist variable when needed

For !CONFIG_NUMA, hashdist will always be 0, since it's setter is
otherwise compiled out.  So we can save 4 bytes of data and some .text
(although mostly in __init functions) by only defining it for
CONFIG_NUMA.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 0995c2de8162..f589222bfa87 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename,
 /* Only NUMA needs hash distribution. 64bit NUMA architectures have
  * sufficient vmalloc space.
  */
-#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT)
-#define HASHDIST_DEFAULT 1
+#ifdef CONFIG_NUMA
+#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT)
+extern int hashdist;		/* Distribute hashes across NUMA nodes? */
 #else
-#define HASHDIST_DEFAULT 0
+#define hashdist (0)
 #endif
-extern int hashdist;		/* Distribute hashes across NUMA nodes? */
 
 
 #endif /* _LINUX_BOOTMEM_H */
-- 
cgit v1.2.3


From c761471b58e6138938ebc6eafec20b2f60cb3397 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 24 Jun 2015 16:56:33 -0700
Subject: mm: avoid tail page refcounting on non-THP compound pages

Reintroduce 8d63d99a5dfb ("mm: avoid tail page refcounting on non-THP
compound pages") after removing bogus VM_BUG_ON_PAGE() in
put_unrefcounted_compound_page().

THP uses tail page refcounting to be able to split huge pages at any time.
 Tail page refcounting is not needed for other users of compound pages and
it's harmful because of overhead.

We try to exclude non-THP pages from tail page refcounting using
__compound_tail_refcounted() check.  It excludes most common non-THP
compound pages: SL*B and hugetlb, but it doesn't catch rest of __GFP_COMP
users -- drivers.

And it's not only about overhead.

Drivers might want to use compound pages to get refcounting semantics
suitable for mapping high-order pages to userspace.  But tail page
refcounting breaks it.

Tail page refcounting uses ->_mapcount in tail pages to store GUP pins on
them.  It means GUP pins would affect page_mapcount() for tail pages.
It's not a problem for THP, because it never maps tail pages.  But unlike
THP, drivers map parts of compound pages with PTEs and it makes
page_mapcount() be called for tail pages.

In particular, GUP pins would shift PSS up and affect /proc/kpagecount for
such pages.  But, I'm not aware about anything which can lead to crash or
other serious misbehaviour.

Since currently all THP pages are anonymous and all drivers pages are not,
we can fix the __compound_tail_refcounted() check by requiring PageAnon()
to enable tail page refcounting.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0755b9fd03a7..8b086070c3a5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -499,7 +499,7 @@ static inline int page_count(struct page *page)
 
 static inline bool __compound_tail_refcounted(struct page *page)
 {
-	return !PageSlab(page) && !PageHeadHuge(page);
+	return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
 }
 
 /*
-- 
cgit v1.2.3


From ead07f6a867b5b1b41cf703735e8b39094987a7d Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Wed, 24 Jun 2015 16:56:48 -0700
Subject: mm/memory-failure: introduce get_hwpoison_page() for consistent
 refcount handling

memory_failure() can run in 2 different mode (specified by
MF_COUNT_INCREASED) in page refcount perspective.  When
MF_COUNT_INCREASED is set, memory_failure() assumes that the caller
takes a refcount of the target page.  And if cleared, memory_failure()
takes it in it's own.

In current code, however, refcounting is done differently in each caller.
For example, madvise_hwpoison() uses get_user_pages_fast() and
hwpoison_inject() uses get_page_unless_zero().  So this inconsistent
refcounting causes refcount failure especially for thp tail pages.
Typical user visible effects are like memory leak or
VM_BUG_ON_PAGE(!page_count(page)) in isolate_lru_page().

To fix this refcounting issue, this patch introduces get_hwpoison_page()
to handle thp tail pages in the same manner for each caller of hwpoison
code.

memory_failure() might fail to split thp and in such case it returns
without completing page isolation.  This is not good because PageHWPoison
on the thp is still set and there's no easy way to unpoison such thps.  So
this patch try to roll back any action to the thp in "non anonymous thp"
case and "thp split failed" case, expecting an MCE(SRAR) generated by
later access afterward will properly free such thps.

[akpm@linux-foundation.org: fix CONFIG_HWPOISON_INJECT=m]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b086070c3a5..cd9df66138a1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2146,6 +2146,7 @@ enum mf_flags {
 extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
+extern int get_hwpoison_page(struct page *page);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
-- 
cgit v1.2.3


From 16e951966f05da5ccd650104176f6ba289f7fa20 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 24 Jun 2015 16:57:07 -0700
Subject: mm: oom_kill: clean up victim marking and exiting interfaces

Rename unmark_oom_victim() to exit_oom_victim().  Marking and unmarking
are related in functionality, but the interface is not symmetrical at
all: one is an internal OOM killer function used during the killing, the
other is for an OOM victim to signal its own death on exit later on.
This has locking implications, see follow-up changes.

While at it, rename mark_tsk_oom_victim() to mark_oom_victim(), which
is easier on the eye.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 44b2f6f7bbd8..a8e6a498cbcb 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -47,9 +47,7 @@ static inline bool oom_task_origin(const struct task_struct *p)
 	return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
 }
 
-extern void mark_tsk_oom_victim(struct task_struct *tsk);
-
-extern void unmark_oom_victim(void);
+extern void mark_oom_victim(struct task_struct *tsk);
 
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -75,6 +73,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 
 extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *mask, bool force_kill);
+
+extern void exit_oom_victim(void);
+
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
-- 
cgit v1.2.3


From dc56401fc9f25e8f93899991ec858c98a331d88c Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 24 Jun 2015 16:57:19 -0700
Subject: mm: oom_kill: simplify OOM killer locking

The zonelist locking and the oom_sem are two overlapping locks that are
used to serialize global OOM killing against different things.

The historical zonelist locking serializes OOM kills from allocations with
overlapping zonelists against each other to prevent killing more tasks
than necessary in the same memory domain.  Only when neither tasklists nor
zonelists from two concurrent OOM kills overlap (tasks in separate memcgs
bound to separate nodes) are OOM kills allowed to execute in parallel.

The younger oom_sem is a read-write lock to serialize OOM killing against
the PM code trying to disable the OOM killer altogether.

However, the OOM killer is a fairly cold error path, there is really no
reason to optimize for highly performant and concurrent OOM kills.  And
the oom_sem is just flat-out redundant.

Replace both locking schemes with a single global mutex serializing OOM
kills regardless of context.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index a8e6a498cbcb..7deecb7bca5e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -32,6 +32,8 @@ enum oom_scan_t {
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
+extern struct mutex oom_lock;
+
 static inline void set_current_oom_origin(void)
 {
 	current->signal->oom_flags |= OOM_FLAG_ORIGIN;
@@ -60,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
 			     const char *message);
 
-extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
-
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			       int order, const nodemask_t *nodemask,
 			       struct mem_cgroup *memcg);
-- 
cgit v1.2.3


From cc637b1704d78b068c2eb700eec384c69ea56cdf Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Wed, 24 Jun 2015 16:57:30 -0700
Subject: memory-failure: export page_type and action result

Export 'outcome' and 'action_page_type' to mm.h, so we could use
this emnus outside.

This patch is preparation for adding trace events for memory-failure
recovery action.

Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Jim Davis <jim.epost@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd9df66138a1..986a9a221ee0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2153,6 +2153,40 @@ extern void shake_page(struct page *p, int access);
 extern atomic_long_t num_poisoned_pages;
 extern int soft_offline_page(struct page *page, int flags);
 
+
+/*
+ * Error handlers for various types of pages.
+ */
+enum mf_outcome {
+	MF_IGNORED,	/* Error: cannot be handled */
+	MF_FAILED,	/* Error: handling failed */
+	MF_DELAYED,	/* Will be handled later */
+	MF_RECOVERED,	/* Successfully recovered */
+};
+
+enum mf_action_page_type {
+	MF_MSG_KERNEL,
+	MF_MSG_KERNEL_HIGH_ORDER,
+	MF_MSG_SLAB,
+	MF_MSG_DIFFERENT_COMPOUND,
+	MF_MSG_POISONED_HUGE,
+	MF_MSG_HUGE,
+	MF_MSG_FREE_HUGE,
+	MF_MSG_UNMAP_FAILED,
+	MF_MSG_DIRTY_SWAPCACHE,
+	MF_MSG_CLEAN_SWAPCACHE,
+	MF_MSG_DIRTY_MLOCKED_LRU,
+	MF_MSG_CLEAN_MLOCKED_LRU,
+	MF_MSG_DIRTY_UNEVICTABLE_LRU,
+	MF_MSG_CLEAN_UNEVICTABLE_LRU,
+	MF_MSG_DIRTY_LRU,
+	MF_MSG_CLEAN_LRU,
+	MF_MSG_TRUNCATED_LRU,
+	MF_MSG_BUDDY,
+	MF_MSG_BUDDY_2ND,
+	MF_MSG_UNKNOWN,
+};
+
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr,
-- 
cgit v1.2.3


From cc3e2af42e7b7e0457b93bf17c19b44c635cd40c Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Wed, 24 Jun 2015 16:57:33 -0700
Subject: memory-failure: change type of action_result's param 3 to enum

Change type of action_result's param 3 to enum for type consistency,
and rename mf_outcome to mf_result for clearly.

Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Jim Davis <jim.epost@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 986a9a221ee0..24ad583596d1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2157,7 +2157,7 @@ extern int soft_offline_page(struct page *page, int flags);
 /*
  * Error handlers for various types of pages.
  */
-enum mf_outcome {
+enum mf_result {
 	MF_IGNORED,	/* Error: cannot be handled */
 	MF_FAILED,	/* Error: handling failed */
 	MF_DELAYED,	/* Will be handled later */
-- 
cgit v1.2.3


From 8809aa2d28d74111ff2f1928edaa4e9845c97a7d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 24 Jun 2015 16:57:44 -0700
Subject: mm: clarify that the function operates on hugepage pte

We have confusing functions to clear pmd, pmd_clear_* and pmd_clear.  Add
_huge_ to pmdp_clear functions so that we are clear that they operate on
hugepage pte.

We don't bother about other functions like pmdp_set_wrprotect,
pmdp_clear_flush_young, because they operate on PTE bits and hence
indicate they are operating on hugepage ptes

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 95243d28a0ee..61cd67f4d788 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	___pte;								\
 })
 
-#define pmdp_clear_flush_notify(__vma, __haddr, __pmd)			\
+#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd)		\
 ({									\
 	unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;		\
 	struct mm_struct *___mm = (__vma)->vm_mm;			\
 	pmd_t ___pmd;							\
 									\
-	___pmd = pmdp_clear_flush(__vma, __haddr, __pmd);		\
+	___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd);		\
 	mmu_notifier_invalidate_range(___mm, ___haddr,			\
 				      ___haddr + HPAGE_PMD_SIZE);	\
 									\
 	___pmd;								\
 })
 
-#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd)			\
+#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd)		\
 ({									\
 	unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;		\
 	pmd_t ___pmd;							\
 									\
-	___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd);		\
+	___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd);		\
 	mmu_notifier_invalidate_range(__mm, ___haddr,			\
 				      ___haddr + HPAGE_PMD_SIZE);	\
 									\
@@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define	ptep_clear_flush_notify ptep_clear_flush
-#define pmdp_clear_flush_notify pmdp_clear_flush
-#define pmdp_get_and_clear_notify pmdp_get_and_clear
+#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
+#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
-- 
cgit v1.2.3


From fc6daaf93151877748f8096af6b3fddb147f22d6 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 24 Jun 2015 16:58:09 -0700
Subject: mm/memblock: add extra "flags" to memblock to allow selection of
 memory based on attribute

Some high end Intel Xeon systems report uncorrectable memory errors as a
recoverable machine check.  Linux has included code for some time to
process these and just signal the affected processes (or even recover
completely if the error was in a read only page that can be replaced by
reading from disk).

But we have no recovery path for errors encountered during kernel code
execution.  Except for some very specific cases were are unlikely to ever
be able to recover.

Enter memory mirroring. Actually 3rd generation of memory mirroing.

Gen1: All memory is mirrored
	Pro: No s/w enabling - h/w just gets good data from other side of the
	     mirror
	Con: Halves effective memory capacity available to OS/applications

Gen2: Partial memory mirror - just mirror memory begind some memory controllers
	Pro: Keep more of the capacity
	Con: Nightmare to enable. Have to choose between allocating from
	     mirrored memory for safety vs. NUMA local memory for performance

Gen3: Address range partial memory mirror - some mirror on each memory
      controller
	Pro: Can tune the amount of mirror and keep NUMA performance
	Con: I have to write memory management code to implement

The current plan is just to use mirrored memory for kernel allocations.
This has been broken into two phases:

1) This patch series - find the mirrored memory, use it for boot time
   allocations

2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the
   unused mirrored memory from mm/memblock.c and only give it out to
   select kernel allocations (this is still being scoped because
   page_alloc.c is scary).

This patch (of 3):

Add extra "flags" to memblock to allow selection of memory based on
attribute.  No functional changes

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9497ec7c77ea..7aeec0cb4c27 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -21,7 +21,10 @@
 #define INIT_PHYSMEM_REGIONS	4
 
 /* Definition of memblock flags. */
-#define MEMBLOCK_HOTPLUG	0x1	/* hotpluggable region */
+enum {
+	MEMBLOCK_NONE		= 0x0,	/* No special request */
+	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
+};
 
 struct memblock_region {
 	phys_addr_t base;
@@ -61,7 +64,7 @@ extern bool movable_node_enabled;
 
 phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
 					    phys_addr_t start, phys_addr_t end,
-					    int nid);
+					    int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
 phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -85,11 +88,13 @@ int memblock_remove_range(struct memblock_type *type,
 			  phys_addr_t base,
 			  phys_addr_t size);
 
-void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range(u64 *idx, int nid, ulong flags,
+		      struct memblock_type *type_a,
 		      struct memblock_type *type_b, phys_addr_t *out_start,
 		      phys_addr_t *out_end, int *out_nid);
 
-void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
+			  struct memblock_type *type_a,
 			  struct memblock_type *type_b, phys_addr_t *out_start,
 			  phys_addr_t *out_end, int *out_nid);
 
@@ -100,16 +105,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
  * @type_a: ptr to memblock_type to iterate
  * @type_b: ptr to memblock_type which excludes from the iteration
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
  */
-#define for_each_mem_range(i, type_a, type_b, nid,			\
+#define for_each_mem_range(i, type_a, type_b, nid, flags,		\
 			   p_start, p_end, p_nid)			\
-	for (i = 0, __next_mem_range(&i, nid, type_a, type_b,		\
+	for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b,	\
 				     p_start, p_end, p_nid);		\
 	     i != (u64)ULLONG_MAX;					\
-	     __next_mem_range(&i, nid, type_a, type_b,			\
+	     __next_mem_range(&i, nid, flags, type_a, type_b,		\
 			      p_start, p_end, p_nid))
 
 /**
@@ -119,17 +125,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
  * @type_a: ptr to memblock_type to iterate
  * @type_b: ptr to memblock_type which excludes from the iteration
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
  */
-#define for_each_mem_range_rev(i, type_a, type_b, nid,			\
+#define for_each_mem_range_rev(i, type_a, type_b, nid, flags,		\
 			       p_start, p_end, p_nid)			\
 	for (i = (u64)ULLONG_MAX,					\
-		     __next_mem_range_rev(&i, nid, type_a, type_b,	\
+		     __next_mem_range_rev(&i, nid, flags, type_a, type_b,\
 					 p_start, p_end, p_nid);	\
 	     i != (u64)ULLONG_MAX;					\
-	     __next_mem_range_rev(&i, nid, type_a, type_b,		\
+	     __next_mem_range_rev(&i, nid, flags, type_a, type_b,	\
 				  p_start, p_end, p_nid))
 
 #ifdef CONFIG_MOVABLE_NODE
@@ -181,13 +188,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flags: pick from blocks based on memory attributes
  *
  * Walks over free (memory && !reserved) areas of memblock.  Available as
  * soon as memblock is initialized.
  */
-#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid)		\
+#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid)	\
 	for_each_mem_range(i, &memblock.memory, &memblock.reserved,	\
-			   nid, p_start, p_end, p_nid)
+			   nid, flags, p_start, p_end, p_nid)
 
 /**
  * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@@ -196,13 +204,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flags: pick from blocks based on memory attributes
  *
  * Walks over free (memory && !reserved) areas of memblock in reverse
  * order.  Available as soon as memblock is initialized.
  */
-#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid)	\
+#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end,	\
+					p_nid)				\
 	for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved,	\
-			       nid, p_start, p_end, p_nid)
+			       nid, flags, p_start, p_end, p_nid)
 
 static inline void memblock_set_region_flags(struct memblock_region *r,
 					     unsigned long flags)
@@ -273,7 +283,8 @@ static inline bool memblock_bottom_up(void) { return false; }
 #define MEMBLOCK_ALLOC_ACCESSIBLE	0
 
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
-					phys_addr_t start, phys_addr_t end);
+					phys_addr_t start, phys_addr_t end,
+					ulong flags);
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 				phys_addr_t max_addr);
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
-- 
cgit v1.2.3


From a3f5bafcc04aaf62990e0cf3ced1cc6d8dc6fe95 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 24 Jun 2015 16:58:12 -0700
Subject: mm/memblock: allocate boot time data structures from mirrored memory

Try to allocate all boot time kernel data structures from mirrored
memory.

If we run out of mirrored memory print warnings, but fall back to using
non-mirrored memory to make sure that we still boot.

By number of bytes, most of what we allocate at boot time is the page
structures.  64 bytes per 4K page on x86_64 ...  or about 1.5% of total
system memory.  For workloads where the bulk of memory is allocated to
applications this may represent a useful improvement to system
availability since 1.5% of total memory might be a third of the memory
allocated to the kernel.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 7aeec0cb4c27..0215ffd63069 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -24,6 +24,7 @@
 enum {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
 	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
+	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
 };
 
 struct memblock_region {
@@ -78,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
+int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
+ulong choose_memblock_flags(void);
 
 /* Low level functions */
 int memblock_add_range(struct memblock_type *type,
@@ -160,6 +163,11 @@ static inline bool movable_node_is_enabled(void)
 }
 #endif
 
+static inline bool memblock_is_mirror(struct memblock_region *m)
+{
+	return m->flags & MEMBLOCK_MIRROR;
+}
+
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
-- 
cgit v1.2.3


From b05b9f5f9dcf593a0e9327676b78e6c17b4218e8 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 24 Jun 2015 16:58:15 -0700
Subject: x86, mirror: x86 enabling - find mirrored memory ranges

UEFI GetMemoryMap() uses a new attribute bit to mark mirrored memory
address ranges.  See UEFI 2.5 spec pages 157-158:

  http://www.uefi.org/sites/default/files/resources/UEFI%202_5.pdf

On EFI enabled systems scan the memory map and tell memblock about any
mirrored ranges.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/efi.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 2092965afca3..5f19efe4eb3f 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -96,6 +96,8 @@ typedef	struct {
 #define EFI_MEMORY_WP		((u64)0x0000000000001000ULL)	/* write-protect */
 #define EFI_MEMORY_RP		((u64)0x0000000000002000ULL)	/* read-protect */
 #define EFI_MEMORY_XP		((u64)0x0000000000004000ULL)	/* execute-protect */
+#define EFI_MEMORY_MORE_RELIABLE \
+				((u64)0x0000000000010000ULL)	/* higher reliability */
 #define EFI_MEMORY_RUNTIME	((u64)0x8000000000000000ULL)	/* range requires runtime mapping */
 #define EFI_MEMORY_DESCRIPTOR_VERSION	1
 
@@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void);	/* switch EFI to virtual mode, if pos
 extern void efi_late_init(void);
 extern void efi_free_boot_services(void);
 extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size);
+extern void efi_find_mirror(void);
 #else
 static inline void efi_late_init(void) {}
 static inline void efi_free_boot_services(void) {}
-- 
cgit v1.2.3


From d1dc6f1bcf1e998e7ce65fc120da371ab047a999 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 24 Jun 2015 16:58:18 -0700
Subject: frontswap: allow multiple backends

Change frontswap single pointer to a singly linked list of frontswap
implementations.  Update Xen tmem implementation as register no longer
returns anything.

Frontswap only keeps track of a single implementation; any
implementation that registers second (or later) will replace the
previously registered implementation, and gets a pointer to the previous
implementation that the new implementation is expected to pass all
frontswap functions to if it can't handle the function itself.  However
that method doesn't really make much sense, as passing that work on to
every implementation adds unnecessary work to implementations; instead,
frontswap should simply keep a list of all registered implementations
and try each implementation for any function.  Most importantly, neither
of the two currently existing frontswap implementations in the kernel
actually do anything with any previous frontswap implementation that
they replace when registering.

This allows frontswap to successfully manage multiple implementations by
keeping a list of them all.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/frontswap.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 8293262401de..e65ef959546c 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -6,16 +6,16 @@
 #include <linux/bitops.h>
 
 struct frontswap_ops {
-	void (*init)(unsigned);
-	int (*store)(unsigned, pgoff_t, struct page *);
-	int (*load)(unsigned, pgoff_t, struct page *);
-	void (*invalidate_page)(unsigned, pgoff_t);
-	void (*invalidate_area)(unsigned);
+	void (*init)(unsigned); /* this swap type was just swapon'ed */
+	int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
+	int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
+	void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
+	void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
+	struct frontswap_ops *next; /* private pointer to next ops */
 };
 
 extern bool frontswap_enabled;
-extern struct frontswap_ops *
-	frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_register_ops(struct frontswap_ops *ops);
 extern void frontswap_shrink(unsigned long);
 extern unsigned long frontswap_curr_pages(void);
 extern void frontswap_writethrough(bool);
-- 
cgit v1.2.3


From 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Wed, 24 Jun 2015 16:58:51 -0700
Subject: mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc()

Beginning at commit d52d3997f843 ("ipv6: Create percpu rt6_info"), the
following INFO splat is logged:

  ===============================
  [ INFO: suspicious RCU usage. ]
  4.1.0-rc7-next-20150612 #1 Not tainted
  -------------------------------
  kernel/sched/core.c:7318 Illegal context switch in RCU-bh read-side critical section!
  other info that might help us debug this:
  rcu_scheduler_active = 1, debug_locks = 0
   3 locks held by systemd/1:
   #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff815f0c8f>] rtnetlink_rcv+0x1f/0x40
   #1:  (rcu_read_lock_bh){......}, at: [<ffffffff816a34e2>] ipv6_add_addr+0x62/0x540
   #2:  (addrconf_hash_lock){+...+.}, at: [<ffffffff816a3604>] ipv6_add_addr+0x184/0x540
  stack backtrace:
  CPU: 0 PID: 1 Comm: systemd Not tainted 4.1.0-rc7-next-20150612 #1
  Hardware name: TOSHIBA TECRA A50-A/TECRA A50-A, BIOS Version 4.20   04/17/2014
  Call Trace:
    dump_stack+0x4c/0x6e
    lockdep_rcu_suspicious+0xe7/0x120
    ___might_sleep+0x1d5/0x1f0
    __might_sleep+0x4d/0x90
    kmem_cache_alloc+0x47/0x250
    create_object+0x39/0x2e0
    kmemleak_alloc_percpu+0x61/0xe0
    pcpu_alloc+0x370/0x630

Additional backtrace lines are truncated.  In addition, the above splat
is followed by several "BUG: sleeping function called from invalid
context at mm/slub.c:1268" outputs.  As suggested by Martin KaFai Lau,
these are the clue to the fix.  Routine kmemleak_alloc_percpu() always
uses GFP_KERNEL for its allocations, whereas it should follow the gfp
from its callers.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: <stable@vger.kernel.org>	[3.18+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmemleak.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index e705467ddb47..d0a1f99e24e3 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -28,7 +28,8 @@
 extern void kmemleak_init(void) __ref;
 extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
 			   gfp_t gfp) __ref;
-extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref;
+extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+				  gfp_t gfp) __ref;
 extern void kmemleak_free(const void *ptr) __ref;
 extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
 extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
@@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
 					    gfp_t gfp)
 {
 }
-static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+					 gfp_t gfp)
 {
 }
 static inline void kmemleak_free(const void *ptr)
-- 
cgit v1.2.3