From 62f75532b583c03840f31e40386ce2df73be9ca0 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 14 Apr 2008 18:50:44 +0300 Subject: slub: Initialize per-cpu stats As spotted by kmemcheck, we need to initialize the per-CPU ->stat array before using it. [kmem_cache_cpu structures are usually allocated from arrays defined via DEFINE_PER_CPU that are zeroed so we have not noticed this so far --cl]. Reported-by: Vegard Nossum Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index acc975fcc8cc..15a7a0d45d71 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1886,6 +1886,9 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, c->node = 0; c->offset = s->offset / sizeof(void *); c->objsize = s->objsize; +#ifdef CONFIG_SLUB_STATS + memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); +#endif } static void init_kmem_cache_node(struct kmem_cache_node *n) -- cgit v1.2.3 From 4097d6017576a5e138f442f5e3c393ad00d10f58 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 18:51:18 +0300 Subject: slub: Reduce #ifdef ZONE_DMA by moving kmalloc_caches_dma near dma logic Move the definition of kmalloc_caches_dma() into a later #ifdef CONFIG_ZONE_DMA. This saves one #ifdef and leaves us with a total of two #ifdefs for dma slab support. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 15a7a0d45d71..3df6d5bdd711 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2412,10 +2412,6 @@ EXPORT_SYMBOL(kmem_cache_destroy); struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; -#endif - static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2475,6 +2471,7 @@ panic: } #ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; static void sysfs_add_func(struct work_struct *w) { -- cgit v1.2.3 From 5b06c853ad447636e31d105e95c48ae9abb6bfb5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 18:51:34 +0300 Subject: slub: Deal with config variable dependencies count_partial() is used by both slabinfo and the sysfs proc support. Move the function directly before the beginning of the sysfs code so that it can be easily found. Rework the preprocessor conditional to take into account that slub sysfs support depends on CONFIG_SYSFS *and* CONFIG_SLUB_DEBUG. Make CONFIG_SLUB_STATS depend on CONFIG_SLUB_DEBUG and CONFIG_SYSFS. There is no point in keeping statistics if no one can retrieve them.
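[Editor's note: the compile-time dependency described above can be summarized by the sketch below; it only restates what the patch that follows introduces and is not additional code.]

#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || \
	defined(CONFIG_SLABINFO)
/*
 * count_partial() is needed by the SLUB sysfs support (which requires both
 * CONFIG_SYSFS and CONFIG_SLUB_DEBUG) and by /proc/slabinfo (CONFIG_SLABINFO),
 * so it is built whenever either consumer is configured.
 */
static unsigned long count_partial(struct kmem_cache_node *n);
#endif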
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 3df6d5bdd711..3fcdcf7d77ba 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2688,21 +2688,6 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); -#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLABINFO) -static unsigned long count_partial(struct kmem_cache_node *n) -{ - unsigned long flags; - unsigned long x = 0; - struct page *page; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += page->inuse; - spin_unlock_irqrestore(&n->list_lock, flags); - return x; -} -#endif - /* * kmem_cache_shrink removes empty slabs from the partial lists and sorts * the remaining slabs by the number of items in use. The slabs with the @@ -3181,6 +3166,21 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return slab_alloc(s, gfpflags, node, caller); } +#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO) +static unsigned long count_partial(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += page->inuse; + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} +#endif + #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) -- cgit v1.2.3 From 50ef37b96c11e76625067ae413dc54585ea22585 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 18:52:05 +0300 Subject: slub: Fixes to per cpu stat output in sysfs Only output per cpu stats if the kernel is build for SMP. Use a capital "C" as a leading character for the processor number (same as the numa statistics that also use a capital letter "N"). Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 3fcdcf7d77ba..23e5ee7b149f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3979,10 +3979,12 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) len = sprintf(buf, "%lu", sum); +#ifdef CONFIG_SMP for_each_online_cpu(cpu) { if (data[cpu] && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]); + len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); } +#endif kfree(data); return len + sprintf(buf + len, "\n"); } -- cgit v1.2.3 From 49bd5221ce8fb55d12c04a3ffd375201c5bbfb7a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 18:52:18 +0300 Subject: slub: Move map/flag clearing to __free_slab __free_slab does some diagnostics. The resetting of mapcount etc in discard_slab() can interfere with debug processing. So move the reset immediately before the page is freed. 
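[Editor's note: a simplified sketch of the ordering this patch establishes. run_slab_debug_checks() is an illustrative placeholder for the diagnostics (slab_pad_check(), check_object(), ...) that __free_slab() performs; the remaining calls are the ones moved by the patch.]

static void __free_slab_sketch(struct kmem_cache *s, struct page *page)
{
	/* diagnostics still see an intact slab page: PG_slab set, mapcount untouched */
	run_slab_debug_checks(s, page);

	/* reset the page state only once the checks are done ... */
	__ClearPageSlab(page);
	reset_page_mapcount(page);

	/* ... immediately before the page goes back to the page allocator */
	__free_pages(page, s->order);
}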
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 23e5ee7b149f..f924cffb29e7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1125,6 +1125,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); + __ClearPageSlab(page); + reset_page_mapcount(page); __free_pages(page, s->order); } @@ -1154,8 +1156,6 @@ static void discard_slab(struct kmem_cache *s, struct page *page) struct kmem_cache_node *n = get_node(s, page_to_nid(page)); atomic_long_dec(&n->nr_slabs); - reset_page_mapcount(page); - __ClearPageSlab(page); free_slab(s, page); } -- cgit v1.2.3 From 0f389ec63077521166f071e1e970aed36147fd45 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 18:53:02 +0300 Subject: slub: No need for per node slab counters if !SLUB_DEBUG The per node counters are used mainly for showing data through the sysfs API. If that API is not compiled in then there is no point in keeping track of this data. Disable counters for the number of slabs and the number of total slabs if !SLUB_DEBUG. Incrementing the per node counters is also accessing a potentially contended cacheline so this could actually be a performance benefit to embedded systems. SLABINFO support is also affected. It now must depend on SLUB_DEBUG (which is on by default). Patch also avoids a check for a NULL kmem_cache_node pointer in new_slab() if the system is not compiled with NUMA support. [penberg@cs.helsinki.fi: fix oops and move ->nr_slabs into CONFIG_SLUB_DEBUG] Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 51 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f924cffb29e7..7f8aaa291a4e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -837,6 +837,35 @@ static void remove_full(struct kmem_cache *s, struct page *page) spin_unlock(&n->list_lock); } +/* Tracking of the number of slabs for debugging purposes */ +static inline unsigned long slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + return atomic_long_read(&n->nr_slabs); +} + +static inline void inc_slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + /* + * May be called early in order to allocate a slab for the + * kmem_cache_node structure. Solve the chicken-egg + * dilemma by deferring the increment of the count during + * bootstrap (see early_kmem_cache_node_alloc). 
+ */ + if (!NUMA_BUILD || n) + atomic_long_inc(&n->nr_slabs); +} +static inline void dec_slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + atomic_long_dec(&n->nr_slabs); +} + +/* Object debug checks for alloc/free paths */ static void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) { @@ -1028,6 +1057,11 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, return flags; } #define slub_debug 0 + +static inline unsigned long slabs_node(struct kmem_cache *s, int node) + { return 0; } +static inline void inc_slabs_node(struct kmem_cache *s, int node) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node) {} #endif /* * Slab allocation and freeing @@ -1066,7 +1100,6 @@ static void setup_object(struct kmem_cache *s, struct page *page, static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - struct kmem_cache_node *n; void *start; void *last; void *p; @@ -1078,9 +1111,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; - n = get_node(s, page_to_nid(page)); - if (n) - atomic_long_inc(&n->nr_slabs); + inc_slabs_node(s, page_to_nid(page)); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | @@ -1153,9 +1184,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) static void discard_slab(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - atomic_long_dec(&n->nr_slabs); + dec_slabs_node(s, page_to_nid(page)); free_slab(s, page); } @@ -1894,10 +1923,10 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; - atomic_long_set(&n->nr_slabs, 0); spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); INIT_LIST_HEAD(&n->full); #endif } @@ -2066,7 +2095,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_tracking(kmalloc_caches, n); #endif init_kmem_cache_node(n); - atomic_long_inc(&n->nr_slabs); + inc_slabs_node(kmalloc_caches, node); /* * lockdep requires consistent irq usage for each lock @@ -2379,7 +2408,7 @@ static inline int kmem_cache_close(struct kmem_cache *s) struct kmem_cache_node *n = get_node(s, node); n->nr_partial -= free_list(s, n, &n->partial); - if (atomic_long_read(&n->nr_slabs)) + if (slabs_node(s, node)) return 1; } free_kmem_cache_nodes(s); @@ -2801,7 +2830,7 @@ static void slab_mem_offline_callback(void *arg) * and offline_pages() function shoudn't call this * callback. So, we must fail. */ - BUG_ON(atomic_long_read(&n->nr_slabs)); + BUG_ON(slabs_node(s, offline_node)); s->node[offline_node] = NULL; kmem_cache_free(kmalloc_caches, n); -- cgit v1.2.3 From c33fa9f5609e918824446ef9a75319d4a802f1f4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Apr 2008 20:05:36 +0200 Subject: uaccess: add probe_kernel_write() add probe_kernel_read() and probe_kernel_write(). Uninlined and restricted to kernel range memory only, as suggested by Linus. 
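[Editor's note: a hypothetical usage sketch, not part of the patch; dump_word() is an invented example caller that only shows the intended calling convention of the new helper.]

static int dump_word(void *addr)
{
	unsigned long val;

	/* returns -EFAULT instead of faulting if addr is not a readable kernel address */
	if (probe_kernel_read(&val, addr, sizeof(val)))
		return -EFAULT;

	printk(KERN_INFO "%p: %08lx\n", addr, val);
	return 0;
}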
Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- mm/Makefile | 2 +- mm/maccess.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 mm/maccess.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index a5b0dd93427a..18c143b3c46c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ vmalloc.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ - page_alloc.o page-writeback.o pdflush.o \ + maccess.o page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o $(mmu-y) diff --git a/mm/maccess.c b/mm/maccess.c new file mode 100644 index 000000000000..24f81b971403 --- /dev/null +++ b/mm/maccess.c @@ -0,0 +1,49 @@ +/* + * Access kernel memory without faulting. + */ +#include +#include +#include + +/** + * probe_kernel_read(): safely attempt to read from a location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from + * @size: size of the data chunk + * + * Safely read from address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ +long probe_kernel_read(void *dst, void *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, + (__force const void __user *)src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(probe_kernel_read); + +/** + * probe_kernel_write(): safely attempt to write to a location + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ +long probe_kernel_write(void *dst, void *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(probe_kernel_write); -- cgit v1.2.3 From b4b8ac524d9b6ed7229017145afa1d7afbea4a48 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 20 Feb 2008 13:33:38 -0600 Subject: kgdb: fix optional arch functions and probe_kernel_* Fix two regressions dealing with the kgdb core. 1) kgdb_skipexception and kgdb_post_primary_code are optional functions that are only required on archs that need special exception fixups. 2) The kernel address space scope must be set on any probe_kernel_* function or archs such as ARCH=arm will not allow access to the kernel memory space. As an example, access to the full kernel address space is required when you use the kernel debugger to inspect a system call. Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- mm/maccess.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/maccess.c b/mm/maccess.c index 24f81b971403..ac40796cfb15 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -17,11 +17,14 @@ long probe_kernel_read(void *dst, void *src, size_t size) { long ret; + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); pagefault_disable(); ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, size); pagefault_enable(); + set_fs(old_fs); return ret ? 
-EFAULT : 0; } @@ -39,10 +42,13 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); long probe_kernel_write(void *dst, void *src, size_t size) { long ret; + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); pagefault_disable(); ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); pagefault_enable(); + set_fs(old_fs); return ret ? -EFAULT : 0; } -- cgit v1.2.3 From d366f8cbc16882e93538d9a52423c2f50dad7c06 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:12 -0700 Subject: cpumask: Cleanup more uses of CPU_MASK and NODE_MASK * Replace usages of CPU_MASK_NONE, CPU_MASK_ALL, NODE_MASK_NONE, NODE_MASK_ALL to reduce stack requirements for large NR_CPUS and MAXNODES counts. * In some cases, the cpumask variable was initialized but then overwritten with another value. This is the case for changes like this: - cpumask_t oldmask = CPU_MASK_ALL; + cpumask_t oldmask; Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- mm/allocpercpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index b0012e27fea8..f4026bae6eed 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -82,9 +82,10 @@ EXPORT_SYMBOL_GPL(percpu_populate); int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, cpumask_t *mask) { - cpumask_t populated = CPU_MASK_NONE; + cpumask_t populated; int cpu; + cpus_clear(populated); for_each_cpu_mask(cpu, *mask) if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { __percpu_depopulate_mask(__pdata, &populated); -- cgit v1.2.3 From f9a86fcbbb1e5542eabf45c9144ac4b6330861a4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:07 -0700 Subject: cpuset: modify cpuset_set_cpus_allowed to use cpumask pointer * Modify cpuset_cpus_allowed to return the currently allowed cpuset via a pointer argument instead of as the function return value. * Use new set_cpus_allowed_ptr function. * Cleanup CPU_MASK_ALL and NODE_MASK_ALL uses. Depends on: [sched-devel]: sched: add new set_cpus_allowed_ptr function Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- mm/pdflush.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index 8f6ee073c0e3..0ceacff56457 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -187,8 +187,8 @@ static int pdflush(void *dummy) * This is needed as pdflush's are dynamically created and destroyed. * The boottime pdflush's are easily placed w/o these 2 lines. */ - cpus_allowed = cpuset_cpus_allowed(current); - set_cpus_allowed(current, cpus_allowed); + cpuset_cpus_allowed(current, &cpus_allowed); + set_cpus_allowed_ptr(current, &cpus_allowed); return __pdflush(&my_work); } -- cgit v1.2.3 From c5f59f0833df945eef7ff35f3dc6ba61c5f293dd Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:10 -0700 Subject: nodemask: use new node_to_cpumask_ptr function * Use new node_to_cpumask_ptr. This creates a pointer to the cpumask for a given node. This definition is in mm patch: asm-generic-add-node_to_cpumask_ptr-macro.patch * Use new set_cpus_allowed_ptr function. Depends on: [mm-patch]: asm-generic-add-node_to_cpumask_ptr-macro.patch [sched-devel]: sched: add new set_cpus_allowed_ptr function [x86/latest]: x86: add cpus_scnprintf function Cc: Greg Kroah-Hartman Cc: Greg Banks Cc: H. 
Peter Anvin Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 6 +++--- mm/slab.c | 5 ++--- mm/vmscan.c | 18 ++++++++---------- 3 files changed, 13 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 402a504f1228..32e796af12a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2029,6 +2029,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = -1; + node_to_cpumask_ptr(tmp, 0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -2037,7 +2038,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) } for_each_node_state(n, N_HIGH_MEMORY) { - cpumask_t tmp; /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) @@ -2050,8 +2050,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - tmp = node_to_cpumask(n); - if (!cpus_empty(tmp)) + node_to_cpumask_ptr_next(tmp, n); + if (!cpus_empty(*tmp)) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ diff --git a/mm/slab.c b/mm/slab.c index 04b308c3bc54..03927cb5ec9e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1160,14 +1160,13 @@ static void __cpuinit cpuup_canceled(long cpu) struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); + node_to_cpumask_ptr(mask, node); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; - cpumask_t mask; - mask = node_to_cpumask(node); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; @@ -1183,7 +1182,7 @@ static void __cpuinit cpuup_canceled(long cpu) if (nc) free_block(cachep, nc->entry, nc->avail, node); - if (!cpus_empty(mask)) { + if (!cpus_empty(*mask)) { spin_unlock_irq(&l3->list_lock); goto free_array_cache; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 4046434046e6..f80a5b7c057f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1647,11 +1647,10 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - cpumask_t cpumask; + node_to_cpumask_ptr(cpumask, pgdat->node_id); - cpumask = node_to_cpumask(pgdat->node_id); - if (!cpus_empty(cpumask)) - set_cpus_allowed(tsk, cpumask); + if (!cpus_empty(*cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; /* @@ -1880,17 +1879,16 @@ out: static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - pg_data_t *pgdat; - cpumask_t mask; int nid; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_node_state(nid, N_HIGH_MEMORY) { - pgdat = NODE_DATA(nid); - mask = node_to_cpumask(pgdat->node_id); - if (any_online_cpu(mask) != NR_CPUS) + pg_data_t *pgdat = NODE_DATA(nid); + node_to_cpumask_ptr(mask, pgdat->node_id); + + if (any_online_cpu(*mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ - set_cpus_allowed(pgdat->kswapd, mask); + set_cpus_allowed_ptr(pgdat->kswapd, mask); } } return NOTIFY_OK; -- cgit v1.2.3 From da19cbcf71cde3c09587b5924d113f0c7f1fd23a Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Mon, 4 Feb 2008 23:35:47 -0800 Subject: driver core: memory: semaphore to mutex Signed-off-by: Daniel Walker Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7469c503580d..0fb330271271 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -208,7 +208,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) /* * This doesn't need a lock to do pfn_to_page(). * The section can't be removed here because of the - * memory_block->state_sem. + * memory_block->state_mutex. */ zone = page_zone(pfn_to_page(pfn)); pgdat_resize_lock(zone->zone_pgdat, &flags); -- cgit v1.2.3 From f5264481c8049673e2cc8c7aca410931f571ba2d Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 21 Apr 2008 22:15:06 +0000 Subject: trivial: small cleanups These are small cleanups all over the tree. Trivial style and comment changes to fs/select.c, kernel/signal.c, kernel/stop_machine.c & mm/pdflush.c Signed-off-by: Pavel Machek Signed-off-by: Jesper Juhl --- mm/pdflush.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index 8f6ee073c0e3..3931f716454a 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -17,8 +17,8 @@ #include #include #include -#include // Needed by writeback.h -#include // Prototypes pdflush_operation() +#include /* Needed by writeback.h */ +#include /* Prototypes pdflush_operation() */ #include #include #include -- cgit v1.2.3 From 3dc5063786b273f1aee545844f6bd4e9651ebffe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Apr 2008 12:28:01 -0700 Subject: slab_err: Pass parameters correctly to slab_bug Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7f8aaa291a4e..39592b5ce68a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -521,7 +521,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) static void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { - slab_bug(s, reason); + slab_bug(s, "%s", reason); print_trailer(s, page, object); } @@ -533,7 +533,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - slab_bug(s, fmt); + slab_bug(s, "%s", buf); print_page_info(page); dump_stack(); } -- cgit v1.2.3 From e123dd3f0ec1664576456ea1ea045591a0a95f0c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Apr 2008 11:51:06 -0700 Subject: mm: make mem_map allocation continuous vmemmap allocation currently has this layout: [ffffe20000000000-ffffe200001fffff] PMD ->ffff810001400000 on node 0 [ffffe20000200000-ffffe200003fffff] PMD ->ffff810001800000 on node 0 [ffffe20000400000-ffffe200005fffff] PMD ->ffff810001c00000 on node 0 [ffffe20000600000-ffffe200007fffff] PMD ->ffff810002000000 on node 0 [ffffe20000800000-ffffe200009fffff] PMD ->ffff810002400000 on node 0 ... note that there is a 2M hole between them - not optimal. the root cause is that usemap (24 bytes) will be allocated after every 2M mem_map, and it will push next vmemmap (2M) to the next (2M) alignment. solution: try to allocate the mem_map continously. after the patch, we get: [ffffe20000000000-ffffe200001fffff] PMD ->ffff810001400000 on node 0 [ffffe20000200000-ffffe200003fffff] PMD ->ffff810001600000 on node 0 [ffffe20000400000-ffffe200005fffff] PMD ->ffff810001800000 on node 0 [ffffe20000600000-ffffe200007fffff] PMD ->ffff810001a00000 on node 0 [ffffe20000800000-ffffe200009fffff] PMD ->ffff810001c00000 on node 0 ... 
which is the ideal layout. and usemap will share a page because of they are allocated continuously too: sparse_early_usemap_alloc: usemap = ffff810024e00000 size = 24 sparse_early_usemap_alloc: usemap = ffff810024e00080 size = 24 sparse_early_usemap_alloc: usemap = ffff810024e00100 size = 24 sparse_early_usemap_alloc: usemap = ffff810024e00180 size = 24 ... so we make the bootmem allocation more compact and use less memory for usemap => mission accomplished ;-) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/sparse.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 98d6b39c3472..458109b99e61 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -304,22 +304,48 @@ void __init sparse_init(void) unsigned long pnum; struct page *map; unsigned long *usemap; + unsigned long **usemap_map; + int size; + + /* + * map is using big page (aka 2M in x86 64 bit) + * usemap is less one page (aka 24 bytes) + * so alloc 2M (with 2M align) and 24 bytes in turn will + * make next 2M slip to one more 2M later. + * then in big system, the memory will have a lot of holes... + * here try to allocate 2M pages continously. + * + * powerpc need to call sparse_init_one_section right after each + * sparse_early_mem_map_alloc, so allocate usemap_map at first. + */ + size = sizeof(unsigned long *) * NR_MEM_SECTIONS; + usemap_map = alloc_bootmem(size); + if (!usemap_map) + panic("can not allocate usemap_map\n"); for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { if (!present_section_nr(pnum)) continue; + usemap_map[pnum] = sparse_early_usemap_alloc(pnum); + } - map = sparse_early_mem_map_alloc(pnum); - if (!map) + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + if (!present_section_nr(pnum)) continue; - usemap = sparse_early_usemap_alloc(pnum); + usemap = usemap_map[pnum]; if (!usemap) continue; + map = sparse_early_mem_map_alloc(pnum); + if (!map) + continue; + sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); } + + free_bootmem(__pa(usemap_map), size); } #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3 From ad09315cad17458e51c7f1f8b371cb942c54b955 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 10 Mar 2008 23:23:42 -0700 Subject: mm: fix alloc_bootmem_core to use fast searching for all nodes Make the nodes other than node 0 use bdata->last_success for fast search too. We need to use __alloc_bootmem_core() for vmemmap allocation for other nodes when numa and sparsemem/vmemmap are enabled. Also, make fail_block path increase i with incr only after ALIGN to avoid extra increase when size is larger than align. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/bootmem.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 2ccea700968f..3c012fb58745 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -238,28 +238,32 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. 
*/ - if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { - preferred = goal - bdata->node_boot_start; + preferred = 0; + if (goal && PFN_DOWN(goal) < end_pfn) { + if (goal > bdata->node_boot_start) + preferred = goal - bdata->node_boot_start; if (bdata->last_success >= preferred) if (!limit || (limit && limit > bdata->last_success)) preferred = bdata->last_success; - } else - preferred = 0; + } preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; incr = align >> PAGE_SHIFT ? : 1; restart_scan: - for (i = preferred; i < eidx; i += incr) { + for (i = preferred; i < eidx;) { unsigned long j; + i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); i = ALIGN(i, incr); if (i >= eidx) break; - if (test_bit(i, bdata->node_bootmem_map)) + if (test_bit(i, bdata->node_bootmem_map)) { + i += incr; continue; + } for (j = i + 1; j < i + areasize; ++j) { if (j >= eidx) goto fail_block; @@ -270,6 +274,8 @@ restart_scan: goto found; fail_block: i = ALIGN(j, incr); + if (i == j) + i += incr; } if (preferred > offset) { -- cgit v1.2.3 From 9a2dc04cf070ee98e014a172695782ff42015fc4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 18 Mar 2008 12:44:48 -0700 Subject: mm: offset align in alloc_bootmem() need offset alignment when node_boot_start's alignment is less than the alignment required. use local node_boot_start to match alignment - so don't add extra operation in search loop. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/bootmem.c | 60 ++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 3c012fb58745..0f30bc873ecc 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -206,9 +206,11 @@ void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { - unsigned long offset, remaining_size, areasize, preferred; + unsigned long areasize, preferred; unsigned long i, start = 0, incr, eidx, end_pfn; void *ret; + unsigned long node_boot_start; + void *node_bootmem_map; if (!size) { printk("__alloc_bootmem_core(): zero-sized request\n"); @@ -216,23 +218,29 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, } BUG_ON(align & (align-1)); - if (limit && bdata->node_boot_start >= limit) - return NULL; - /* on nodes without memory - bootmem_map is NULL */ if (!bdata->node_bootmem_map) return NULL; + /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? 
*/ + node_boot_start = bdata->node_boot_start; + node_bootmem_map = bdata->node_bootmem_map; + if (align) { + node_boot_start = ALIGN(bdata->node_boot_start, align); + if (node_boot_start > bdata->node_boot_start) + node_bootmem_map = (unsigned long *)bdata->node_bootmem_map + + PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG; + } + + if (limit && node_boot_start >= limit) + return NULL; + end_pfn = bdata->node_low_pfn; limit = PFN_DOWN(limit); if (limit && end_pfn > limit) end_pfn = limit; - eidx = end_pfn - PFN_DOWN(bdata->node_boot_start); - offset = 0; - if (align && (bdata->node_boot_start & (align - 1UL)) != 0) - offset = align - (bdata->node_boot_start & (align - 1UL)); - offset = PFN_DOWN(offset); + eidx = end_pfn - PFN_DOWN(node_boot_start); /* * We try to allocate bootmem pages above 'goal' @@ -240,15 +248,16 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, */ preferred = 0; if (goal && PFN_DOWN(goal) < end_pfn) { - if (goal > bdata->node_boot_start) - preferred = goal - bdata->node_boot_start; + if (goal > node_boot_start) + preferred = goal - node_boot_start; - if (bdata->last_success >= preferred) + if (bdata->last_success > node_boot_start && + bdata->last_success - node_boot_start >= preferred) if (!limit || (limit && limit > bdata->last_success)) - preferred = bdata->last_success; + preferred = bdata->last_success - node_boot_start; } - preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; + preferred = PFN_DOWN(ALIGN(preferred, align)); areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; incr = align >> PAGE_SHIFT ? : 1; @@ -256,18 +265,18 @@ restart_scan: for (i = preferred; i < eidx;) { unsigned long j; - i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); + i = find_next_zero_bit(node_bootmem_map, eidx, i); i = ALIGN(i, incr); if (i >= eidx) break; - if (test_bit(i, bdata->node_bootmem_map)) { + if (test_bit(i, node_bootmem_map)) { i += incr; continue; } for (j = i + 1; j < i + areasize; ++j) { if (j >= eidx) goto fail_block; - if (test_bit(j, bdata->node_bootmem_map)) + if (test_bit(j, node_bootmem_map)) goto fail_block; } start = i; @@ -278,14 +287,14 @@ restart_scan: i += incr; } - if (preferred > offset) { - preferred = offset; + if (preferred > 0) { + preferred = 0; goto restart_scan; } return NULL; found: - bdata->last_success = PFN_PHYS(start); + bdata->last_success = PFN_PHYS(start) + node_boot_start; BUG_ON(start >= eidx); /* @@ -295,6 +304,7 @@ found: */ if (align < PAGE_SIZE && bdata->last_offset && bdata->last_pos+1 == start) { + unsigned long offset, remaining_size; offset = ALIGN(bdata->last_offset, align); BUG_ON(offset > PAGE_SIZE); remaining_size = PAGE_SIZE - offset; @@ -303,14 +313,12 @@ found: /* last_pos unchanged */ bdata->last_offset = offset + size; ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + - offset + - bdata->node_boot_start); + offset + node_boot_start); } else { remaining_size = size - remaining_size; areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + - offset + - bdata->node_boot_start); + offset + node_boot_start); bdata->last_pos = start + areasize - 1; bdata->last_offset = remaining_size; } @@ -318,14 +326,14 @@ found: } else { bdata->last_pos = start + areasize - 1; bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + ret = phys_to_virt(start * PAGE_SIZE + node_boot_start); } /* * Reserve the area now: */ for (i = start; i < start + areasize; i++) - if (unlikely(test_and_set_bit(i, 
bdata->node_bootmem_map))) + if (unlikely(test_and_set_bit(i, node_bootmem_map))) BUG(); memset(ret, 0, size); return ret; -- cgit v1.2.3 From a5645a61b3b7e7d7de15e1a642ead600150ce94d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 18 Mar 2008 12:49:12 -0700 Subject: mm: allow reserve_bootmem() cross nodes split reserve_bootmem_core() into two functions, one which checks conflicts, and one which sets the bits. and make reserve_bootmem to loop bdata_list to cross the nodes. user could be crashkernel and ramdisk..., in case the range provided by those externalities crosses the nodes. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/bootmem.c | 92 +++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 0f30bc873ecc..b6791646143e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -111,44 +111,74 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat, * might be used for boot-time allocations - or it might get added * to the free page pool later on. */ -static int __init reserve_bootmem_core(bootmem_data_t *bdata, +static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size, int flags) { unsigned long sidx, eidx; unsigned long i; - int ret; + + BUG_ON(!size); + + /* out of range, don't hold other */ + if (addr + size < bdata->node_boot_start || + PFN_DOWN(addr) > bdata->node_low_pfn) + return 0; /* - * round up, partially reserved pages are considered - * fully reserved. + * Round up to index to the range. */ + if (addr > bdata->node_boot_start) + sidx= PFN_DOWN(addr - bdata->node_boot_start); + else + sidx = 0; + + eidx = PFN_UP(addr + size - bdata->node_boot_start); + if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) + eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + + for (i = sidx; i < eidx; i++) { + if (test_bit(i, bdata->node_bootmem_map)) { + if (flags & BOOTMEM_EXCLUSIVE) + return -EBUSY; + } + } + + return 0; + +} + +static void __init reserve_bootmem_core(bootmem_data_t *bdata, + unsigned long addr, unsigned long size, int flags) +{ + unsigned long sidx, eidx; + unsigned long i; + BUG_ON(!size); - BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); - BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); - BUG_ON(addr < bdata->node_boot_start); - sidx = PFN_DOWN(addr - bdata->node_boot_start); + /* out of range */ + if (addr + size < bdata->node_boot_start || + PFN_DOWN(addr) > bdata->node_low_pfn) + return; + + /* + * Round up to index to the range. 
+ */ + if (addr > bdata->node_boot_start) + sidx= PFN_DOWN(addr - bdata->node_boot_start); + else + sidx = 0; + eidx = PFN_UP(addr + size - bdata->node_boot_start); + if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) + eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); - for (i = sidx; i < eidx; i++) + for (i = sidx; i < eidx; i++) { if (test_and_set_bit(i, bdata->node_bootmem_map)) { #ifdef CONFIG_DEBUG_BOOTMEM printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); #endif - if (flags & BOOTMEM_EXCLUSIVE) { - ret = -EBUSY; - goto err; - } } - - return 0; - -err: - /* unreserve memory we accidentally reserved */ - for (i--; i >= sidx; i--) - clear_bit(i, bdata->node_bootmem_map); - - return ret; + } } static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, @@ -415,6 +445,11 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { + int ret; + + ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); + if (ret < 0) + return; reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); } @@ -440,7 +475,18 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { - return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags); + bootmem_data_t *bdata; + int ret; + + list_for_each_entry(bdata, &bdata_list, list) { + ret = can_reserve_bootmem_core(bdata, addr, size, flags); + if (ret < 0) + return ret; + } + list_for_each_entry(bdata, &bdata_list, list) + reserve_bootmem_core(bdata, addr, size, flags); + + return 0; } #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ -- cgit v1.2.3 From c2b91e2eec9678dbda274e906cc32ea8f711da3b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 12 Apr 2008 01:19:24 -0700 Subject: x86_64/mm: check and print vmemmap allocation continuous On big systems with lots of memory, don't print out too much during bootup, and make it easy to find if it is continuous. 
on 256G 8 sockets system will get [ffffe20000000000-ffffe20002bfffff] PMD -> [ffff810001400000-ffff810003ffffff] on node 0 [ffffe2001c700000-ffffe2001c7fffff] potential offnode page_structs [ffffe20002c00000-ffffe2001c7fffff] PMD -> [ffff81000c000000-ffff8100255fffff] on node 0 [ffffe20038700000-ffffe200387fffff] potential offnode page_structs [ffffe2001c800000-ffffe200387fffff] PMD -> [ffff810820200000-ffff81083c1fffff] on node 1 [ffffe20040000000-ffffe2007fffffff] PUD ->ffff811027a00000 on node 2 [ffffe20038800000-ffffe2003fffffff] PMD -> [ffff811020200000-ffff8110279fffff] on node 2 [ffffe20054700000-ffffe200547fffff] potential offnode page_structs [ffffe20040000000-ffffe200547fffff] PMD -> [ffff811027c00000-ffff81103c3fffff] on node 2 [ffffe20070700000-ffffe200707fffff] potential offnode page_structs [ffffe20054800000-ffffe200707fffff] PMD -> [ffff811820200000-ffff81183c1fffff] on node 3 [ffffe20080000000-ffffe200bfffffff] PUD ->ffff81202fa00000 on node 4 [ffffe20070800000-ffffe2007fffffff] PMD -> [ffff812020200000-ffff81202f9fffff] on node 4 [ffffe2008c700000-ffffe2008c7fffff] potential offnode page_structs [ffffe20080000000-ffffe2008c7fffff] PMD -> [ffff81202fc00000-ffff81203c3fffff] on node 4 [ffffe200a8700000-ffffe200a87fffff] potential offnode page_structs [ffffe2008c800000-ffffe200a87fffff] PMD -> [ffff812820200000-ffff81283c1fffff] on node 5 [ffffe200c0000000-ffffe200ffffffff] PUD ->ffff813037a00000 on node 6 [ffffe200a8800000-ffffe200bfffffff] PMD -> [ffff813020200000-ffff8130379fffff] on node 6 [ffffe200c4700000-ffffe200c47fffff] potential offnode page_structs [ffffe200c0000000-ffffe200c47fffff] PMD -> [ffff813037c00000-ffff81303c3fffff] on node 6 [ffffe200c4800000-ffffe200e07fffff] PMD -> [ffff813820200000-ffff81383c1fffff] on node 7 instead of a very long print out... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- mm/sparse.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 458109b99e61..7e9191381f86 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -295,6 +295,9 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) return NULL; } +void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) +{ +} /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -345,6 +348,8 @@ void __init sparse_init(void) usemap); } + vmemmap_populate_print_last(); + free_bootmem(__pa(usemap_map), size); } -- cgit v1.2.3 From 5b7baf05783b1ac97a510243d7e82293416a7cf6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Mar 2008 18:47:12 +0100 Subject: s390: KVM preparation: host memory management changes for s390 kvm This patch changes the s390 memory management defintions to use the pgste field for dirty and reference bit tracking of host and guest code. Usually on s390, dirty and referenced are tracked in storage keys, which belong to the physical page. This changes with virtualization: The guest and host dirty/reference bits are defined to be the logical OR of the values for the mapping and the physical page. This patch implements the necessary changes in pgtable.h for s390. There is a common code change in mm/rmap.c, the call to page_test_and_clear_young must be moved. This is a no-op for all architecture but s390. page_referenced checks the referenced bits for the physiscal page and for all mappings: o The physical page is checked with page_test_and_clear_young. 
o The mappings are checked with ptep_test_and_clear_young and friends. Without pgstes (the current implementation on Linux s390) the physical page check is implemented but the mapping callbacks are no-ops because dirty and referenced are not tracked in the s390 page tables. The pgstes introduces guest and host dirty and reference bits for s390 in the host mapping. These mapping must be checked before page_test_and_clear_young resets the reference bit. Signed-off-by: Heiko Carstens Signed-off-by: Christian Borntraeger Acked-by: Martin Schwidefsky Acked-by: Andrew Morton Signed-off-by: Carsten Otte Signed-off-by: Avi Kivity --- mm/rmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 997f06907b6d..e9bb6b1093f6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -413,9 +413,6 @@ int page_referenced(struct page *page, int is_locked, { int referenced = 0; - if (page_test_and_clear_young(page)) - referenced++; - if (TestClearPageReferenced(page)) referenced++; @@ -433,6 +430,10 @@ int page_referenced(struct page *page, int is_locked, unlock_page(page); } } + + if (page_test_and_clear_young(page)) + referenced++; + return referenced; } -- cgit v1.2.3 From 0701a9e649bf0ffdac0a761d3c3d1041f5375d90 Mon Sep 17 00:00:00 2001 From: Yi Li Date: Fri, 25 Apr 2008 19:49:21 +0300 Subject: slob: fix bug - when slob allocates "struct kmem_cache", it does not force alignment. This may trigger misaligned memory access exception. Acked-by: Matt Mackall Signed-off-by: Yi Li Signed-off-by: Bryan Wu Signed-off-by: Pekka Enberg --- mm/slob.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index e2c3c0ec5463..6038cbadf796 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -533,7 +533,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, { struct kmem_cache *c; - c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1); + c = slob_alloc(sizeof(struct kmem_cache), + flags, ARCH_KMALLOC_MINALIGN, -1); if (c) { c->name = name; -- cgit v1.2.3 From d629d819579327267884a12de21ef6d4b539db88 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 23 Apr 2008 22:31:08 +0300 Subject: slub: improve kmem_cache_destroy() error message As pointed out by Ingo, the SLUB warning of calling kmem_cache_destroy() with cache that still has objects triggers in practice. So turn this WARN_ON() into a nice SLUB specific error message to avoid people confusing it to a SLUB bug. Cc: Ingo Molnar Acked-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 39592b5ce68a..378d3f1b548f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2426,8 +2426,11 @@ void kmem_cache_destroy(struct kmem_cache *s) if (!s->refcount) { list_del(&s->list); up_write(&slub_lock); - if (kmem_cache_close(s)) - WARN_ON(1); + if (kmem_cache_close(s)) { + printk(KERN_ERR "SLUB %s: %s called for cache that " + "still has objects.\n", s->name, __func__); + dump_stack(); + } sysfs_slab_remove(s); } else up_write(&slub_lock); -- cgit v1.2.3 From 599870b175987008b5f5c82a70b89f751e12822e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Apr 2008 12:36:52 -0700 Subject: slub: free_list() cleanup free_list looked a bit screwy so here is an attempt to clean it up. free_list is is only used for freeing partial lists. 
We do not need to return a parameter if we decrement nr_partial within the function which allows a simplification of the whole thing. The current version modifies nr_partial outside of the list_lock which is technically not correct. It was only ok because we should be the only user of this slab cache at this point. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 378d3f1b548f..c937233127e2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2372,25 +2372,21 @@ const char *kmem_cache_name(struct kmem_cache *s) EXPORT_SYMBOL(kmem_cache_name); /* - * Attempt to free all slabs on a node. Return the number of slabs we - * were unable to free. + * Attempt to free all partial slabs on a node. */ -static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, - struct list_head *list) +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - int slabs_inuse = 0; unsigned long flags; struct page *page, *h; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, list, lru) + list_for_each_entry_safe(page, h, &n->partial, lru) if (!page->inuse) { list_del(&page->lru); discard_slab(s, page); - } else - slabs_inuse++; + n->nr_partial--; + } spin_unlock_irqrestore(&n->list_lock, flags); - return slabs_inuse; } /* @@ -2407,8 +2403,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - n->nr_partial -= free_list(s, n, &n->partial); - if (slabs_node(s, node)) + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) return 1; } free_kmem_cache_nodes(s); -- cgit v1.2.3 From 33b12c38134e95e5afa73214af6f49abd7b8418e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 25 Apr 2008 12:22:43 -0700 Subject: slub: Dump list of objects not freed on kmem_cache_close() Dump a list of unfreed objects if a slab cache is closed but objects still remain. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index c937233127e2..64c2b2bfbd79 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2371,6 +2371,32 @@ const char *kmem_cache_name(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_name); +static void list_slab_objects(struct kmem_cache *s, struct page *page, + const char *text) +{ +#ifdef CONFIG_SLUB_DEBUG + void *addr = page_address(page); + void *p; + DECLARE_BITMAP(map, page->objects); + + bitmap_zero(map, page->objects); + slab_err(s, page, "%s", text); + slab_lock(page); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr, page->objects) { + + if (!test_bit(slab_index(p, s, addr), map)) { + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", + p, p - addr); + print_tracking(s, p); + } + } + slab_unlock(page); +#endif +} + /* * Attempt to free all partial slabs on a node. 
*/ @@ -2380,12 +2406,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) struct page *page, *h; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, &n->partial, lru) + list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { list_del(&page->lru); discard_slab(s, page); n->nr_partial--; + } else { + list_slab_objects(s, page, + "Objects remaining on kmem_cache_close()"); } + } spin_unlock_irqrestore(&n->list_lock, flags); } -- cgit v1.2.3 From 39b264641a0c3b5e0e742e2046b49e92d1f3be88 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:30 +0300 Subject: slub: Store max number of objects in the page struct. Split the inuse field up to be able to store the number of objects in this page in the page struct as well. Necessary if we want to have pages of various orders for a slab. Also avoids touching struct kmem_cache cachelines in __slab_alloc(). Update diagnostic code to check the number of objects and make sure that the number of objects always stays within the bounds of a 16 bit unsigned integer. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 54 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 64c2b2bfbd79..6641025c597f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -301,7 +301,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); - if (object < base || object >= base + s->objects * s->size || + if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { return 0; } @@ -451,8 +451,8 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", - page, page->inuse, page->freelist, page->flags); + printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } @@ -652,6 +652,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) p + off, POISON_INUSE, s->size - off); } +/* Check the pad bytes at the end of a slab page */ static int slab_pad_check(struct kmem_cache *s, struct page *page) { u8 *start; @@ -664,20 +665,20 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - end = start + (PAGE_SIZE << s->order); - length = s->objects * s->size; - remainder = end - (start + length); + length = (PAGE_SIZE << s->order); + end = start + length; + remainder = length % s->size; if (!remainder) return 1; - fault = check_bytes(start + length, POISON_INUSE, remainder); + fault = check_bytes(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); - print_section("Padding", start, length); + print_section("Padding", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, start, end); return 0; @@ -739,15 +740,24 @@ static int check_object(struct kmem_cache *s, struct page *page, static int check_slab(struct kmem_cache *s, struct page *page) { + int maxobj; + VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { slab_err(s, page, "Not a valid slab page"); return 0; } - if (page->inuse > s->objects) { + + maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + if (page->objects > maxobj) { + slab_err(s, page, "objects %u > max %u", + s->name, page->objects, maxobj); + return 0; + } + if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, s->objects); + s->name, page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -765,7 +775,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) void *fp = page->freelist; void *object = NULL; - while (fp && nr <= s->objects) { + while (fp && nr <= page->objects) { if (fp == search) return 1; if (!check_valid_pointer(s, page, fp)) { @@ -777,7 +787,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; - page->inuse = s->objects; + page->inuse = page->objects; slab_fix(s, "Freelist cleared"); return 0; } @@ -788,10 +798,10 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - if (page->inuse != s->objects - nr) { + if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, s->objects - nr); - page->inuse = s->objects - nr; + "counted were %d", page->inuse, page->objects - nr); + page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } return search == NULL; @@ -910,7 +920,7 @@ bad: * as used avoids touching the remaining objects. */ slab_fix(s, "Marking all objects used"); - page->inuse = s->objects; + page->inuse = page->objects; page->freelist = NULL; } return 0; @@ -1081,6 +1091,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) return NULL; + page->objects = s->objects; mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -1519,7 +1530,7 @@ load_freelist: goto debug; c->freelist = object[c->offset]; - c->page->inuse = s->objects; + c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); unlock_out: @@ -1818,6 +1829,9 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; + if ((PAGE_SIZE << min_order) / size > 65535) + return get_order(size * 65535) - 1; + for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -3251,7 +3265,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; /* Now we know that a valid freelist exists */ - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); @@ -3528,10 +3542,10 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc) { void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); + DECLARE_BITMAP(map, page->objects); void *p; - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) set_bit(slab_index(p, s, addr), map); -- cgit v1.2.3 From 224a88be40c45c0da5bdc45a8118004a37c60e8a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:31 +0300 Subject: slub: for_each_object must be passed the number of objects in a slab Pass the number of objects to the for_each_object macro. Most of these are debug related. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6641025c597f..67f7d6068934 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -327,8 +327,8 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) } /* Loop over all objects in a slab */ -#define for_each_object(__p, __s, __addr) \ - for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) /* Scan freelist */ @@ -774,6 +774,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) int nr = 0; void *fp = page->freelist; void *object = NULL; + unsigned long max_objects; while (fp && nr <= page->objects) { if (fp == search) @@ -798,6 +799,16 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } + max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + if (max_objects > 65535) + max_objects = 65535; + + if (page->objects != max_objects) { + slab_err(s, page, "Wrong number of objects. Found %d but " + "should be %d", page->objects, max_objects); + page->objects = max_objects; + slab_fix(s, "Number of objects adjusted."); + } if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. 
Counter is %d but " "counted were %d", page->inuse, page->objects - nr); @@ -1135,7 +1146,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) memset(start, POISON_INUSE, PAGE_SIZE << s->order); last = start; - for_each_object(p, s, start) { + for_each_object(p, s, start, page->objects) { setup_object(s, page, last); set_freepointer(s, last, p); last = p; @@ -1157,7 +1168,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) void *p; slab_pad_check(s, page); - for_each_object(p, s, page_address(page)) + for_each_object(p, s, page_address(page), + page->objects) check_object(s, page, p, 0); ClearSlabDebug(page); } @@ -3273,7 +3285,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; } - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) if (!check_object(s, page, p, 1)) return 0; @@ -3549,7 +3561,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, for_each_free_object(p, s, page->freelist) set_bit(slab_index(p, s, addr), map); - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) add_location(t, s, get_track(s, p, alloc)); } -- cgit v1.2.3 From 834f3d119234b35a1985a2449831d99356637937 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:31 +0300 Subject: slub: Add kmem_cache_order_objects struct Pack the order and the number of objects into a single word. This saves some memory in the kmem_cache_structure and more importantly allows us to fetch both values atomically. Later the slab orders become runtime configurable and we need to fetch these two items together in order to properly allocate a slab and initialize its objects. Fix the race by fetching the order and the number of objects in one word. 
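As an illustration of the race fix (a sketch only, not part of the patch; it mirrors the allocate_slab() change and the oo_*() helpers in the diff below), a consumer copies the packed word once and derives both values from that single snapshot, so a concurrent update cannot hand it a mismatched order/object pair:

	/* Sketch: one read of the packed word yields a consistent pair. */
	static struct page *allocate_slab_sketch(struct kmem_cache *s, gfp_t flags)
	{
		struct kmem_cache_order_objects oo = s->oo;	/* single word copy */
		struct page *page;

		page = alloc_pages(flags, oo_order(oo));	/* order from the snapshot */
		if (!page)
			return NULL;

		/* Object count matches the order used above by construction. */
		page->objects = oo_objects(oo);
		return page;
	}
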
[penberg@cs.helsinki.fi: fix memset() page order in new_slab()] Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 76 ++++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 67f7d6068934..0a220df5ed7c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -341,6 +341,26 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline struct kmem_cache_order_objects oo_make(int order, + unsigned long size) +{ + struct kmem_cache_order_objects x = { + (order << 16) + (PAGE_SIZE << order) / size + }; + + return x; +} + +static inline int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> 16; +} + +static inline int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & ((1 << 16) - 1); +} + #ifdef CONFIG_SLUB_DEBUG /* * Debug settings: @@ -665,7 +685,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << s->order); + length = (PAGE_SIZE << compound_order(page)); end = start + length; remainder = length % s->size; if (!remainder) @@ -1090,19 +1110,21 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node) {} static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int pages = 1 << s->order; + struct kmem_cache_order_objects oo = s->oo; + int order = oo_order(oo); + int pages = 1 << order; flags |= s->allocflags; if (node == -1) - page = alloc_pages(flags, s->order); + page = alloc_pages(flags, order); else - page = alloc_pages_node(node, flags, s->order); + page = alloc_pages_node(node, flags, order); if (!page) return NULL; - page->objects = s->objects; + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -1143,7 +1165,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << s->order); + memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); last = start; for_each_object(p, s, start, page->objects) { @@ -1162,7 +1184,8 @@ out: static void __free_slab(struct kmem_cache *s, struct page *page) { - int pages = 1 << s->order; + int order = compound_order(page); + int pages = 1 << order; if (unlikely(SlabDebug(page))) { void *p; @@ -1181,7 +1204,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); reset_page_mapcount(page); - __free_pages(page, s->order); + __free_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -2202,6 +2225,7 @@ static int calculate_sizes(struct kmem_cache *s) unsigned long flags = s->flags; unsigned long size = s->objsize; unsigned long align = s->align; + int order; /* * Round up object size to the next word boundary. We can only @@ -2294,17 +2318,17 @@ static int calculate_sizes(struct kmem_cache *s) * page allocator order 0 allocs so take a reasonably large * order that will allows us a good number of objects. 
*/ - s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); + order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); s->flags |= __PAGE_ALLOC_FALLBACK; s->allocflags |= __GFP_NOWARN; } else - s->order = calculate_order(size); + order = calculate_order(size); - if (s->order < 0) + if (order < 0) return 0; s->allocflags = 0; - if (s->order) + if (order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) @@ -2316,9 +2340,9 @@ static int calculate_sizes(struct kmem_cache *s) /* * Determine the number of objects per slab */ - s->objects = (PAGE_SIZE << s->order) / size; + s->oo = oo_make(order, size); - return !!s->objects; + return !!oo_objects(s->oo); } @@ -2351,7 +2375,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, s->order, + s->name, (unsigned long)size, s->size, oo_order(s->oo), s->offset, flags); return 0; } @@ -2789,8 +2813,9 @@ int kmem_cache_shrink(struct kmem_cache *s) struct kmem_cache_node *n; struct page *page; struct page *t; + int objects = oo_objects(s->oo); struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags; if (!slabs_by_inuse) @@ -2803,7 +2828,7 @@ int kmem_cache_shrink(struct kmem_cache *s) if (!n->nr_partial) continue; - for (i = 0; i < s->objects; i++) + for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); spin_lock_irqsave(&n->list_lock, flags); @@ -2835,7 +2860,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = s->objects - 1; i >= 0; i--) + for (i = objects - 1; i >= 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); @@ -3351,7 +3376,7 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->oo)) * sizeof(unsigned long), GFP_KERNEL); if (!map) @@ -3719,7 +3744,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, - n->nr_partial; if (flags & SO_OBJECTS) - x = full_slabs * s->objects; + x = full_slabs * oo_objects(s->oo); else x = full_slabs; total += x; @@ -3798,13 +3823,13 @@ SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objects); + return sprintf(buf, "%d\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->order); + return sprintf(buf, "%d\n", oo_order(s->oo)); } SLAB_ATTR_RO(order); @@ -4451,11 +4476,12 @@ static int s_show(struct seq_file *m, void *p) nr_inuse += count_partial(n); } - nr_objs = nr_slabs * s->objects; - nr_inuse += (nr_slabs - nr_partials) * s->objects; + nr_objs = nr_slabs * oo_objects(s->oo); + nr_inuse += (nr_slabs - nr_partials) * oo_objects(s->oo); seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, s->objects, (1 << s->order)); + nr_objs, s->size, oo_objects(s->oo), + (1 << oo_order(s->oo))); seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, 0UL); -- cgit v1.2.3 From 205ab99dd103e3dd5b0964dad8a16dfe2db69b2e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:40 
+0300 Subject: slub: Update statistics handling for variable order slabs Change the statistics to consider that slabs of the same slabcache can have different number of objects in them since they may be of different order. Provide a new sysfs field total_objects which shows the total objects that the allocated slabs of a slabcache could hold. Add a max field that holds the largest slab order that was ever used for a slab cache. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 150 ++++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 0a220df5ed7c..c8514e93ffdf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -886,7 +886,7 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node) return atomic_long_read(&n->nr_slabs); } -static inline void inc_slabs_node(struct kmem_cache *s, int node) +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); @@ -896,14 +896,17 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). */ - if (!NUMA_BUILD || n) + if (!NUMA_BUILD || n) { atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } } -static inline void dec_slabs_node(struct kmem_cache *s, int node) +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); } /* Object debug checks for alloc/free paths */ @@ -1101,9 +1104,12 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, static inline unsigned long slabs_node(struct kmem_cache *s, int node) { return 0; } -static inline void inc_slabs_node(struct kmem_cache *s, int node) {} -static inline void dec_slabs_node(struct kmem_cache *s, int node) {} +static inline void inc_slabs_node(struct kmem_cache *s, int node, + int objects) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} #endif + /* * Slab allocation and freeing */ @@ -1155,7 +1161,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; - inc_slabs_node(s, page_to_nid(page)); + inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | @@ -1230,7 +1236,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) static void discard_slab(struct kmem_cache *s, struct page *page) { - dec_slabs_node(s, page_to_nid(page)); + dec_slabs_node(s, page_to_nid(page), page->objects); free_slab(s, page); } @@ -2144,7 +2150,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_tracking(kmalloc_caches, n); #endif init_kmem_cache_node(n); - inc_slabs_node(kmalloc_caches, node); + inc_slabs_node(kmalloc_caches, node, page->objects); /* * lockdep requires consistent irq usage for each lock @@ -2341,6 +2347,8 @@ static int calculate_sizes(struct kmem_cache *s) * Determine the number of objects per slab */ s->oo = oo_make(order, size); + if (oo_objects(s->oo) > oo_objects(s->max)) + s->max = s->oo; return !!oo_objects(s->oo); @@ -2813,7 +2821,7 @@ int kmem_cache_shrink(struct kmem_cache *s) struct kmem_cache_node *n; struct page *page; struct page *t; - int objects = 
oo_objects(s->oo); + int objects = oo_objects(s->max); struct list_head *slabs_by_inuse = kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags; @@ -3276,7 +3284,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, } #if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO) -static unsigned long count_partial(struct kmem_cache_node *n) +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) { unsigned long flags; unsigned long x = 0; @@ -3284,10 +3293,25 @@ static unsigned long count_partial(struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) - x += page->inuse; + x += get_count(page); spin_unlock_irqrestore(&n->list_lock, flags); return x; } + +static int count_inuse(struct page *page) +{ + return page->inuse; +} + +static int count_total(struct page *page) +{ + return page->objects; +} + +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} #endif #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) @@ -3376,7 +3400,7 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->oo)) * + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); if (!map) @@ -3676,22 +3700,23 @@ static int list_locations(struct kmem_cache *s, char *buf, } enum slab_stat_type { - SL_FULL, - SL_PARTIAL, - SL_CPU, - SL_OBJECTS + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ }; -#define SO_FULL (1 << SL_FULL) +#define SO_ALL (1 << SL_ALL) #define SO_PARTIAL (1 << SL_PARTIAL) #define SO_CPU (1 << SL_CPU) #define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { unsigned long total = 0; - int cpu; int node; int x; unsigned long *nodes; @@ -3702,56 +3727,60 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return -ENOMEM; per_cpu = nodes + nr_node_ids; - for_each_possible_cpu(cpu) { - struct page *page; - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (flags & SO_CPU) { + int cpu; - if (!c) - continue; + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - page = c->page; - node = c->node; - if (node < 0) - continue; - if (page) { - if (flags & SO_CPU) { - if (flags & SO_OBJECTS) - x = page->inuse; + if (!c || c->node < 0) + continue; + + if (c->page) { + if (flags & SO_TOTAL) + x = c->page->objects; + else if (flags & SO_OBJECTS) + x = c->page->inuse; else x = 1; + total += x; - nodes[node] += x; + nodes[c->node] += x; } - per_cpu[node]++; + per_cpu[c->node]++; } } - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + if (flags & SO_ALL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); - if (flags & SO_PARTIAL) { - if (flags & SO_OBJECTS) - x = count_partial(n); else - x = n->nr_partial; + x = atomic_long_read(&n->nr_slabs); total += x; nodes[node] += x; } - if (flags & SO_FULL) { - int 
full_slabs = atomic_long_read(&n->nr_slabs) - - per_cpu[node] - - n->nr_partial; + } else if (flags & SO_PARTIAL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); - if (flags & SO_OBJECTS) - x = full_slabs * oo_objects(s->oo); + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); else - x = full_slabs; + x = n->nr_partial; total += x; nodes[node] += x; } } - x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA for_each_node_state(node, N_NORMAL_MEMORY) @@ -3852,7 +3881,7 @@ SLAB_ATTR_RO(aliases); static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); + return show_slab_objects(s, buf, SO_ALL); } SLAB_ATTR_RO(slabs); @@ -3870,10 +3899,22 @@ SLAB_ATTR_RO(cpu_slabs); static ssize_t objects_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); } SLAB_ATTR_RO(objects); +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects_partial); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); @@ -4131,6 +4172,8 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &objects_attr.attr, + &objects_partial_attr.attr, + &total_objects_attr.attr, &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, @@ -4459,7 +4502,8 @@ static int s_show(struct seq_file *m, void *p) unsigned long nr_partials = 0; unsigned long nr_slabs = 0; unsigned long nr_inuse = 0; - unsigned long nr_objs; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; struct kmem_cache *s; int node; @@ -4473,11 +4517,11 @@ static int s_show(struct seq_file *m, void *p) nr_partials += n->nr_partial; nr_slabs += atomic_long_read(&n->nr_slabs); - nr_inuse += count_partial(n); + nr_objs += atomic_long_read(&n->total_objects); + nr_free += count_partial(n, count_free); } - nr_objs = nr_slabs * oo_objects(s->oo); - nr_inuse += (nr_slabs - nr_partials) * oo_objects(s->oo); + nr_inuse = nr_objs - nr_free; seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, nr_objs, s->size, oo_objects(s->oo), -- cgit v1.2.3 From 65c3376aaca96c66aa76014aaf430398964b68cb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:40 +0300 Subject: slub: Fallback to minimal order during slab page allocation If any higher order allocation fails then fall back the smallest order necessary to contain at least one object. This enables fallback for all allocations to order 0 pages. The fallback will waste more memory (objects will not fit neatly) and the fallback slabs will be not as efficient as larger slabs since they contain less objects. Note that SLAB also depends on order 1 allocations for some slabs that waste too much memory if forced into PAGE_SIZE'd page. SLUB now can now deal with failing order 1 allocs which SLAB cannot do. Add a new field min that will contain the objects for the smallest possible order for a slab cache. 
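Because the hunk below is hard to follow in this flattened form, here is the shape of the fallback path it adds to allocate_slab(), written out as a sketch rather than the literal patch text: try the configured order quietly, and only if that fails retry with s->min, the smallest order that still holds one object.

	/* Sketch of the fallback flow inside allocate_slab(). */
	struct kmem_cache_order_objects oo = s->oo;	/* preferred order/objects */
	struct page *page;

	/* Higher order attempt: do not warn and do not retry hard. */
	page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, oo);
	if (unlikely(!page)) {
		/* Fall back to the minimal order that fits one object. */
		oo = s->min;
		page = alloc_slab_page(flags, node, oo);
		if (!page)
			return NULL;
		stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
	}
	page->objects = oo_objects(oo);	/* count matches whichever order succeeded */
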
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index c8514e93ffdf..35c22d940ba7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1113,28 +1113,43 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, /* * Slab allocation and freeing */ +static inline struct page *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) +{ + int order = oo_order(oo); + + if (node == -1) + return alloc_pages(flags, order); + else + return alloc_pages_node(node, flags, order); +} + static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; struct kmem_cache_order_objects oo = s->oo; - int order = oo_order(oo); - int pages = 1 << order; flags |= s->allocflags; - if (node == -1) - page = alloc_pages(flags, order); - else - page = alloc_pages_node(node, flags, order); - - if (!page) - return NULL; + page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, + oo); + if (unlikely(!page)) { + oo = s->min; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + page = alloc_slab_page(flags, node, oo); + if (!page) + return NULL; + stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + } page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); + 1 << oo_order(oo)); return page; } @@ -2347,6 +2362,7 @@ static int calculate_sizes(struct kmem_cache *s) * Determine the number of objects per slab */ s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -4163,7 +4179,7 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); - +STAT_ATTR(ORDER_FALLBACK, order_fallback); #endif static struct attribute *slab_attrs[] = { @@ -4216,6 +4232,7 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &order_fallback_attr.attr, #endif NULL }; -- cgit v1.2.3 From 319d1e240683d37924ea8977c91730c3393fd453 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Drop fallback to page allocator method There is now a generic method of falling back to a slab page of minimal order. No need anymore for the fallback to kmalloc_large(). Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 35c22d940ba7..de6f38761d1f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -204,8 +204,6 @@ static inline void ClearSlabDebug(struct page *page) /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ -#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */ -#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */ /* Not all arches define cache_line_size */ #ifndef cache_line_size @@ -1623,27 +1621,6 @@ new_slab: c->page = new; goto load_freelist; } - - /* - * No memory available. 
- * - * If the slab uses higher order allocs but the object is - * smaller than a page size then we can fallback in emergencies - * to the page allocator via kmalloc_large. The page allocator may - * have failed to obtain a higher order page and we can try to - * allocate a single page if the object fits into a single page. - * That is only possible if certain conditions are met that are being - * checked when a slab is created. - */ - if (!(gfpflags & __GFP_NORETRY) && - (s->flags & __PAGE_ALLOC_FALLBACK)) { - if (gfpflags & __GFP_WAIT) - local_irq_enable(); - object = kmalloc_large(s->objsize, gfpflags); - if (gfpflags & __GFP_WAIT) - local_irq_disable(); - return object; - } return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) @@ -2330,20 +2307,7 @@ static int calculate_sizes(struct kmem_cache *s) */ size = ALIGN(size, align); s->size = size; - - if ((flags & __KMALLOC_CACHE) && - PAGE_SIZE / size < slub_min_objects) { - /* - * Kmalloc cache that would not have enough objects in - * an order 0 page. Kmalloc slabs can fallback to - * page allocator order 0 allocs so take a reasonably large - * order that will allows us a good number of objects. - */ - order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); - s->flags |= __PAGE_ALLOC_FALLBACK; - s->allocflags |= __GFP_NOWARN; - } else - order = calculate_order(size); + order = calculate_order(size); if (order < 0) return 0; @@ -2589,7 +2553,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, down_write(&slub_lock); if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags | __KMALLOC_CACHE, NULL)) + flags, NULL)) goto panic; list_add(&s->list, &slab_caches); @@ -3105,9 +3069,6 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if ((s->flags & __PAGE_ALLOC_FALLBACK)) - return 1; - if (s->ctor) return 1; -- cgit v1.2.3 From 06b285dc3d6194abe79ab9dcaaab703d6f75627c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Make the order configurable for each slab cache Makes /sys/kernel/slab//order writable. The allocation order of a slab cache can then be changed dynamically during runtime. This can be used to override the objects per slabs value establisheed with the slub_min_objects setting that was manually specified or calculated on bootup. The changes of the slab order can occur while allocate_slab() runs. Allocate slab needs the order and the number of slab objects that are both changed by the change of order. Both are put into a single word (struct kmem_cache_order_objects). They can then be atomically updated and retrieved. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index de6f38761d1f..23a2683d6c9f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2218,7 +2218,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) * calculate_sizes() determines the order and the distribution of data within * a slab object. 
*/ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->objsize; @@ -2307,7 +2307,10 @@ static int calculate_sizes(struct kmem_cache *s) */ size = ALIGN(size, align); s->size = size; - order = calculate_order(size); + if (forced_order >= 0) + order = forced_order; + else + order = calculate_order(size); if (order < 0) return 0; @@ -2346,7 +2349,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); - if (!calculate_sizes(s)) + if (!calculate_sizes(s, -1)) goto error; s->refcount = 1; @@ -3833,11 +3836,23 @@ static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objs_per_slab); +static ssize_t order_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int order = simple_strtoul(buf, NULL, 10); + + if (order > slub_max_order || order < slub_min_order) + return -EINVAL; + + calculate_sizes(s, order); + return length; +} + static ssize_t order_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", oo_order(s->oo)); } -SLAB_ATTR_RO(order); +SLAB_ATTR(order); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { @@ -3971,7 +3986,7 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') s->flags |= SLAB_RED_ZONE; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(red_zone); @@ -3990,7 +4005,7 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') s->flags |= SLAB_POISON; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(poison); @@ -4009,7 +4024,7 @@ static ssize_t store_user_store(struct kmem_cache *s, s->flags &= ~SLAB_STORE_USER; if (buf[0] == '1') s->flags |= SLAB_STORE_USER; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(store_user); -- cgit v1.2.3 From 31d33baf36bda7a2fea800648d87c9fe6155e7ca Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Simplify any_slab_object checks Since we now have total_objects counter per node use that to check for the presence of any objects. The loop over all cpu slabs is not that useful since any cpu slab would require an object allocation first. So drop that. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 23a2683d6c9f..06533f342be0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3775,14 +3775,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c && c->page) - return 1; - } for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -3790,7 +3782,7 @@ static int any_slab_objects(struct kmem_cache *s) if (!n) continue; - if (n->nr_partial || atomic_long_read(&n->nr_slabs)) + if (atomic_read(&n->total_objects)) return 1; } return 0; -- cgit v1.2.3 From 114e9e89e668ec561c9b0f3dea7bcc8af7c29d21 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Drop DEFAULT_MAX_ORDER / DEFAULT_MIN_OBJECTS We can now fallback to order 0 slabs. 
So set the slub_max_order to PAGE_ALLOC_COSTLY_ORDER but keep the slub_min_objects at 4. This will mostly preserve the orders used in 2.6.25. F.e. the 2k kmalloc slab will use order 1 allocs and the 4k kmalloc slab order 2. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 06533f342be0..6572cef0c43c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -149,25 +149,6 @@ static inline void ClearSlabDebug(struct page *page) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -#if PAGE_SHIFT <= 12 - -/* - * Small page size. Make sure that we do not fragment memory - */ -#define DEFAULT_MAX_ORDER 1 -#define DEFAULT_MIN_OBJECTS 4 - -#else - -/* - * Large page machines are customarily able to handle larger - * page orders. - */ -#define DEFAULT_MAX_ORDER 2 -#define DEFAULT_MIN_OBJECTS 8 - -#endif - /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -1821,8 +1802,8 @@ static struct page *get_object_page(const void *x) * take the list_lock. */ static int slub_min_order; -static int slub_max_order = DEFAULT_MAX_ORDER; -static int slub_min_objects = DEFAULT_MIN_OBJECTS; +static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static int slub_min_objects = 4; /* * Merge control. If this is set then no merging of slab caches will occur. -- cgit v1.2.3 From 9b2cd506e5f2117f94c28a0040bf5da058105316 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Calculate min_objects based on number of processors. The minimum number of objects per slab is calculated based on the number of processors that may come online.
Processors	min_objects
---------------------------
    1		 8
    2		12
    4		16
    8		20
   16		24
   32		28
   64		32
 1024		48
 4096		56
The higher the number of processors, the larger the order sizes used for various slab caches will become. This has been shown to address the performance issues in hackbench on 16p etc. The calculation is only performed if slub_min_objects is zero (default). If one specifies a slub_min_objects on boot then that setting is taken. As suggested by Zhang Yanmin's performance tests on 16-core Tigerton, use the formula '4 * (fls(nr_cpu_ids) + 1)': ./hackbench 100 process 2000: 1) 2.6.25-rc6slab: 23.5 seconds 2) 2.6.25-rc7SLUB+slub_min_objects=20: 31 seconds 3) 2.6.25-rc7SLUB+slub_min_objects=24: 23.5 seconds Signed-off-by: Christoph Lameter Signed-off-by: Zhang Yanmin Signed-off-by: Pekka Enberg --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6572cef0c43c..e2e6ba7a5172 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1803,7 +1803,7 @@ static struct page *get_object_page(const void *x) */ static int slub_min_order; static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; -static int slub_min_objects = 4; +static int slub_min_objects; /* * Merge control. If this is set then no merging of slab caches will occur. @@ -1880,6 +1880,8 @@ static inline int calculate_order(int size) * we reduce the minimum objects required in a slab.
*/ min_objects = slub_min_objects; + if (!min_objects) + min_objects = 4 * (fls(nr_cpu_ids) + 1); while (min_objects > 1) { fraction = 8; while (fraction >= 4) { -- cgit v1.2.3 From c124f5b54f879e5870befcc076addbd5d614663f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:13:29 +0300 Subject: slub: pack objects denser Since we now have more orders available use a denser packing. Increase slab order if more than 1/16th of a slab would be wasted. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e2e6ba7a5172..d821ce6fff39 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1818,7 +1818,7 @@ static int slub_nomerge; * system components. Generally order 0 allocations should be preferred since * order 0 does not cause fragmentation in the page allocator. Larger objects * be problematic to put into order 0 slabs because there may be too much - * unused space left. We go to a higher order if more than 1/8th of the slab + * unused space left. We go to a higher order if more than 1/16th of the slab * would be wasted. * * In order to reach satisfactory performance we must ensure that a minimum @@ -1883,7 +1883,7 @@ static inline int calculate_order(int size) if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); while (min_objects > 1) { - fraction = 8; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, slub_max_order, fraction); -- cgit v1.2.3 From 556637cdabcd5918c7d4a1a2679b8f86fc81e891 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 28 Apr 2008 02:11:47 -0700 Subject: mm: fix possible off-by-one in walk_pte_range() After the loop in walk_pte_range() pte might point to the first address after the pmd it walks. The pte_unmap() is then applied to something bad. Spotted by Roel Kluin and Andreas Schwab. Signed-off-by: Johannes Weiner Cc: Roel Kluin <12o3l@tiscali.nl> Cc: Andreas Schwab Acked-by: Matt Mackall Acked-by: Mikael Pettersson Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 1cf1417ef8b7..0afd2387e507 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -9,11 +9,15 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; pte = pte_offset_map(pmd, addr); - do { + for (;;) { err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); if (err) break; - } while (pte++, addr += PAGE_SIZE, addr != end); + addr += PAGE_SIZE; + if (addr == end) + break; + pte++; + } pte_unmap(pte); return err; -- cgit v1.2.3 From ea01ea937dcae2caa146dea1918cccf2f16ed3c4 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Mon, 28 Apr 2008 02:12:01 -0700 Subject: hotplug memory remove: generic __remove_pages() support Generic helper function to remove section mappings and sysfs entries for the section of the memory we are removing. offline_pages() correctly adjusted zone and marked the pages reserved. TODO: Yasunori Goto is working on patches to free up allocations from bootmem. 
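As a usage illustration only (the caller below is hypothetical and not part of the patch), an arch- or driver-level remove path would offline the range first and then hand the section-aligned pfn range to __remove_pages():

	/* Hypothetical caller; assumes the range was already offlined via
	 * offline_pages(), which marks the pages reserved and adjusts zones. */
	static int example_remove_range(u64 start, u64 size)
	{
		unsigned long start_pfn = start >> PAGE_SHIFT;	/* section aligned */
		unsigned long nr_pages = size >> PAGE_SHIFT;	/* multiple of PAGES_PER_SECTION */
		struct zone *zone = page_zone(pfn_to_page(start_pfn));

		/* Tears down the sysfs entries and the section memmap/usemap. */
		return __remove_pages(zone, start_pfn, nr_pages);
	}
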
Signed-off-by: Badari Pulavarty Acked-by: Yasunori Goto Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/sparse.c | 45 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 97 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0fb330271271..d5094929766d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -101,6 +101,25 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) return register_new_memory(__pfn_to_section(phys_start_pfn)); } +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + unsigned long flags; + struct pglist_data *pgdat = zone->zone_pgdat; + int ret = -EINVAL; + + if (!valid_section(ms)) + return ret; + + ret = unregister_memory_section(ms); + if (ret) + return ret; + + pgdat_resize_lock(pgdat, &flags); + sparse_remove_one_section(zone, ms); + pgdat_resize_unlock(pgdat, &flags); + return 0; +} + /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will @@ -134,6 +153,42 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, } EXPORT_SYMBOL_GPL(__add_pages); +/** + * __remove_pages() - remove sections of pages from a zone + * @zone: zone from which pages need to be removed + * @phys_start_pfn: starting pageframe (must be aligned to start of a section) + * @nr_pages: number of pages to remove (must be multiple of section size) + * + * Generic helper function to remove section mappings and sysfs entries + * for the section of the memory we are removing. Caller needs to make + * sure that pages are marked reserved and zones are adjust properly by + * calling offline_pages(). + */ +int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i, ret = 0; + int sections_to_remove; + + /* + * We can only remove entire sections + */ + BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); + BUG_ON(nr_pages % PAGES_PER_SECTION); + + release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); + + sections_to_remove = nr_pages / PAGES_PER_SECTION; + for (i = 0; i < sections_to_remove; i++) { + unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + ret = __remove_section(zone, __pfn_to_section(pfn)); + if (ret) + break; + } + return ret; +} +EXPORT_SYMBOL_GPL(__remove_pages); + static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { diff --git a/mm/sparse.c b/mm/sparse.c index 7e9191381f86..186a85bf7912 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -208,12 +208,13 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p } /* - * We need this if we ever free the mem_maps. While not implemented yet, - * this function is included for parity with its sibling. 
+ * Decode mem_map from the coded memmap */ -static __attribute((unused)) +static struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) { + /* mask off the extra low bits of information */ + coded_mem_map &= SECTION_MAP_MASK; return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); } @@ -404,6 +405,28 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +static void free_section_usemap(struct page *memmap, unsigned long *usemap) +{ + if (!usemap) + return; + + /* + * Check to see if allocation came from hot-plug-add + */ + if (PageSlab(virt_to_page(usemap))) { + kfree(usemap); + if (memmap) + __kfree_section_memmap(memmap, PAGES_PER_SECTION); + return; + } + + /* + * TODO: Allocations came from bootmem - how do I free up ? + */ + printk(KERN_WARNING "Not freeing up allocations from bootmem " + "- leaking memory\n"); +} + /* * returns the number of sections whose mem_maps were properly * set. If this is <=0, then that means that the passed-in @@ -456,4 +479,20 @@ out: } return ret; } + +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +{ + struct page *memmap = NULL; + unsigned long *usemap = NULL; + + if (ms->section_mem_map) { + usemap = ms->pageblock_flags; + memmap = sparse_decode_mem_map(ms->section_mem_map, + __section_nr(ms)); + ms->section_mem_map = 0; + ms->pageblock_flags = NULL; + } + + free_section_usemap(memmap, usemap); +} #endif -- cgit v1.2.3 From 180c06efce691f2b721dd0d965079827bdd7ee03 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Apr 2008 02:12:03 -0700 Subject: hotplug-memory: make online_page() common All architectures use an effectively identical definition of online_page(), so just make it common code. x86-64, ia64, powerpc and sh are actually identical; x86-32 is slightly different. x86-32's differences arise because it puts its hotplug pages in the highmem zone. We can handle this in the generic code by inspecting the page to see if its in highmem, and update the totalhigh_pages count appropriately. This leaves init_32.c:free_new_highpage with a single caller, so I folded it into add_one_highpage_init. I also removed an incorrect comment referring to the NUMA case; any NUMA details have already been dealt with by the time online_page() is called. 
[akpm@linux-foundation.org: fix indenting] Signed-off-by: Jeremy Fitzhardinge Acked-by: Dave Hansen Reviewed-by: KAMEZAWA Hiroyuki Tested-by: KAMEZAWA Hiroyuki Cc: Yasunori Goto Cc: Christoph Lameter Acked-by: Ingo Molnar Acked-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d5094929766d..c8b3ca79de2d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -219,6 +219,25 @@ static void grow_pgdat_span(struct pglist_data *pgdat, pgdat->node_start_pfn; } +void online_page(struct page *page) +{ + totalram_pages++; + num_physpages++; + +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages++; +#endif + +#ifdef CONFIG_FLATMEM + max_mapnr = max(page_to_pfn(page), max_mapnr); +#endif + + ClearPageReserved(page); + init_page_count(page); + __free_page(page); +} + static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { -- cgit v1.2.3 From 0dd1334faf7e075bfdb6f5284eed65210b296fc1 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 28 Apr 2008 02:12:08 -0700 Subject: fix invalidate_inode_pages2_range() to not clear ret DIO invalidates page cache through invalidate_inode_pages2_range(). invalidate_inode_pages2_range() sets ret=-EIO when invalidate_complete_page2() fails, but this ret is cleared if do_launder_page() succeed on a page of next index. In this case, dio is carried out even if invalidate_complete_page2() fails on some pages. This can cause inconsistency between memory and blocks on HDD because the page cache still exists. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Hisashi Hifumi Cc: Badari Pulavarty Cc: Ken Chen Cc: Zach Brown Cc: Nick Piggin Cc: Trond Myklebust Cc: "J. Bruce Fields" Cc: Chuck Lever Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 7d20ce41ecf5..b8961cb63414 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -391,6 +391,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t next; int i; int ret = 0; + int ret2 = 0; int did_range_unmap = 0; int wrapped = 0; @@ -438,9 +439,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } } BUG_ON(page_mapped(page)); - ret = do_launder_page(mapping, page); - if (ret == 0 && !invalidate_complete_page2(mapping, page)) - ret = -EIO; + ret2 = do_launder_page(mapping, page); + if (ret2 == 0) { + if (!invalidate_complete_page2(mapping, page)) + ret2 = -EIO; + } + if (ret2 < 0) + ret = ret2; unlock_page(page); } pagevec_release(&pvec); -- cgit v1.2.3 From 4d3d5b41a72b52555d43efbfc4ccde6ba6e5444f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Apr 2008 02:12:10 -0700 Subject: mmap_region: cleanup the final vma_merge() related code It is not easy to actually understand the "if (!file || !vma_merge())" code, turn it into "if (file && vma_merge())". This makes immediately obvious that the subsequent "if (file)" is superfluous. As Hugh Dickins pointed out, we can also factor out the ->i_writecount corrections, and add a small comment about that. 
Signed-off-by: Oleg Nesterov Cc: Miklos Szeredi Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index a32d28ce31cd..6aaf657adb87 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1068,7 +1068,6 @@ int vma_wants_writenotify(struct vm_area_struct *vma) mapping_cap_account_dirty(vma->vm_file->f_mapping); } - unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, unsigned int vm_flags, unsigned long pgoff, @@ -1181,22 +1180,20 @@ munmap_back: if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (!file || !vma_merge(mm, prev, addr, vma->vm_end, + if (file && vma_merge(mm, prev, addr, vma->vm_end, vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - file = vma->vm_file; - vma_link(mm, vma, prev, rb_link, rb_parent); - if (correct_wcount) - atomic_inc(&inode->i_writecount); - } else { - if (file) { - if (correct_wcount) - atomic_inc(&inode->i_writecount); - fput(file); - } mpol_free(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); + fput(file); + } else { + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; } -out: + + /* Once vma denies write, undo our temporary denial count */ + if (correct_wcount) + atomic_inc(&inode->i_writecount); +out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { -- cgit v1.2.3 From 3c18ddd160d1fcd46d1131d9ad6c594dd8e9af99 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 28 Apr 2008 02:12:10 -0700 Subject: mm: remove nopage Nothing in the tree uses nopage any more. Remove support for it in the core mm code and documentation (and a few stray references to it in comments). 
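For context, a sketch of the ->fault style that drivers had already migrated to (illustrative only; the handler and the lookup helper are made-up names): instead of returning a page pointer or NOPAGE_SIGBUS/NOPAGE_OOM, the handler fills vmf->page and reports errors through VM_FAULT_* codes.

	/* Illustrative ->fault handler; example_lookup_page() is hypothetical. */
	static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct page *page;

		page = example_lookup_page(vma->vm_private_data, vmf->pgoff);
		if (!page)
			return VM_FAULT_SIGBUS;	/* was: return NOPAGE_SIGBUS */

		get_page(page);			/* the core expects a referenced page */
		vmf->page = page;
		return 0;
	}

	static struct vm_operations_struct example_vm_ops = {
		.fault	= example_fault,	/* replaces the old .nopage method */
	};
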
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 22 +++++----------------- mm/mincore.c | 2 +- mm/rmap.c | 1 - 3 files changed, 6 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0d14d1e58a5f..46958fb97c2d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1057,8 +1057,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) foll_flags |= FOLL_GET; if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || (!vma->vm_ops->nopage && - !vma->vm_ops->fault))) + (!vma->vm_ops || !vma->vm_ops->fault)) foll_flags |= FOLL_ANON; do { @@ -2199,20 +2198,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, BUG_ON(vma->vm_flags & VM_PFNMAP); - if (likely(vma->vm_ops->fault)) { - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) - return ret; - } else { - /* Legacy ->nopage path */ - ret = 0; - vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); - /* no page was available -- either SIGBUS or OOM */ - if (unlikely(vmf.page == NOPAGE_SIGBUS)) - return VM_FAULT_SIGBUS; - else if (unlikely(vmf.page == NOPAGE_OOM)) - return VM_FAULT_OOM; - } + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; /* * For consistency in subsequent calls, make the faulted page always @@ -2458,7 +2446,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { - if (vma->vm_ops->fault || vma->vm_ops->nopage) + if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, pte, pmd, write_access, entry); if (unlikely(vma->vm_ops->nopfn)) diff --git a/mm/mincore.c b/mm/mincore.c index 5efe0ded69b1..5178800bc129 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -33,7 +33,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * When tmpfs swaps out a page from a file, any process mapping that * file will not get a swp_entry_t in its pte, but rather it is like * any other file mapping (ie. marked !present and faulted in with - * tmpfs's .nopage). So swapped out tmpfs mappings are tested here. + * tmpfs's .fault). So swapped out tmpfs mappings are tested here. * * However when tmpfs moves the page from pagecache and into swapcache, * it is still in core, but the find_get_page below won't find it. diff --git a/mm/rmap.c b/mm/rmap.c index e9bb6b1093f6..bf0a5b7cfb8e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -662,7 +662,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) printk (KERN_EMERG " page->mapping = %p\n", page->mapping); print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); if (vma->vm_ops) { - print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); } if (vma->vm_file && vma->vm_file->f_op) -- cgit v1.2.3 From dac1d27bc8d5ca636d3014ecfdf94407031d1970 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:12 -0700 Subject: mm: use zonelists instead of zones when direct reclaiming pages The following patches replace multiple zonelists per node with two zonelists that are filtered based on the GFP flags. The patches as a set fix a bug with regard to the use of MPOL_BIND and ZONE_MOVABLE. 
With this patchset, the MPOL_BIND will apply to the two highest zones when the highest zone is ZONE_MOVABLE. This should be considered as an alternative fix for the MPOL_BIND+ZONE_MOVABLE in 2.6.23 to the previously discussed hack that filters only custom zonelists. The first patch cleans up an inconsistency where direct reclaim uses zonelist->zones where other places use zonelist. The second patch introduces a helper function node_zonelist() for looking up the appropriate zonelist for a GFP mask which simplifies patches later in the set. The third patch defines/remembers the "preferred zone" for numa statistics, as it is no longer always the first zone in a zonelist. The forth patch replaces multiple zonelists with two zonelists that are filtered. The two zonelists are due to the fact that the memoryless patchset introduces a second set of zonelists for __GFP_THISNODE. The fifth patch introduces helper macros for retrieving the zone and node indices of entries in a zonelist. The final patch introduces filtering of the zonelists based on a nodemask. Two zonelists exist per node, one for normal allocations and one for __GFP_THISNODE. Performance results varied depending on the machine configuration. In real workloads the gain/loss will depend on how much the userspace portion of the benchmark benefits from having more cache available due to reduced referencing of zonelists. These are the range of performance losses/gains when running against 2.6.24-rc4-mm1. The set and these machines are a mix of i386, x86_64 and ppc64 both NUMA and non-NUMA. loss to gain Total CPU time on Kernbench: -0.86% to 1.13% Elapsed time on Kernbench: -0.79% to 0.76% page_test from aim9: -4.37% to 0.79% brk_test from aim9: -0.71% to 4.07% fork_test from aim9: -1.84% to 4.60% exec_test from aim9: -0.71% to 1.08% This patch: The allocator deals with zonelists which indicate the order in which zones should be targeted for an allocation. Similarly, direct reclaim of pages iterates over an array of zones. For consistency, this patch converts direct reclaim to use a zonelist. No functionality is changed by this patch. This simplifies zonelist iterators in the next patch. Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Christoph Lameter Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/vmscan.c | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 32e796af12a1..1bda771a072a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1569,7 +1569,7 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); + did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; diff --git a/mm/vmscan.c b/mm/vmscan.c index f80a5b7c057f..ef8551e0d2d0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1246,10 +1246,11 @@ static unsigned long shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. 
*/ -static unsigned long shrink_zones(int priority, struct zone **zones, +static unsigned long shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { unsigned long nr_reclaimed = 0; + struct zone **zones = zonelist->zones; int i; @@ -1301,8 +1302,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones, * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. */ -static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, - struct scan_control *sc) +static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + gfp_t gfp_mask, struct scan_control *sc) { int priority; int ret = 0; @@ -1310,6 +1311,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; + struct zone **zones = zonelist->zones; int i; if (scan_global_lru(sc)) @@ -1333,7 +1335,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - nr_reclaimed += shrink_zones(priority, zones, sc); + nr_reclaimed += shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -1397,7 +1399,8 @@ out: return ret; } -unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zonelist *zonelist, int order, + gfp_t gfp_mask) { struct scan_control sc = { .gfp_mask = gfp_mask, @@ -1410,7 +1413,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) .isolate_pages = isolate_pages_global, }; - return do_try_to_free_pages(zones, gfp_mask, &sc); + return do_try_to_free_pages(zonelist, gfp_mask, &sc); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR @@ -1428,11 +1431,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, }; - struct zone **zones; + struct zonelist *zonelist; int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); - zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; - if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) + zonelist = &NODE_DATA(numa_node_id())->node_zonelists[target_zone]; + if (do_try_to_free_pages(zonelist, sc.gfp_mask, &sc)) return 1; return 0; } -- cgit v1.2.3 From 0e88460da6ab7bb6a7ef83675412ed5b6315d741 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:14 -0700 Subject: mm: introduce node_zonelist() for accessing the zonelist for a GFP mask Introduce a node_zonelist() helper function. It is used to lookup the appropriate zonelist given a node and a GFP mask. The patch on its own is a cleanup but it helps clarify parts of the two-zonelist-per-node patchset. If necessary, it can be merged with the next patch in this set without problems. 
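The helper itself lands outside the mm/ portion of this log (in the gfp header), but its shape can be inferred from the call sites converted below; as a sketch:

	/* Sketch, inferred from the conversions below: pick the node's zonelist
	 * for the zone type implied by the GFP mask. */
	static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
	{
		return NODE_DATA(nid)->node_zonelists + gfp_zone(flags);
	}
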
Reviewed-by: Christoph Lameter Signed-off-by: Mel Gorman Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Christoph Lameter Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++--- mm/page_alloc.c | 3 +-- mm/slab.c | 3 +-- mm/slub.c | 3 +-- 4 files changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3c3601121509..5d20bf44062f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1183,7 +1183,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) nd = 0; BUG(); } - return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); + return node_zonelist(nd, gfp); } /* Do dynamic interleaving for a process */ @@ -1299,7 +1299,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, if (unlikely(pol != &default_policy && pol != current->mempolicy)) __mpol_free(pol); /* finished with pol */ - return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); + return node_zonelist(nid, gfp_flags); } zl = zonelist_policy(GFP_HIGHUSER, pol); @@ -1321,7 +1321,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, struct zonelist *zl; struct page *page; - zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); + zl = node_zonelist(nid, gfp); page = __alloc_pages(gfp, order, zl); if (page && page_zone(page) == zl->zones[0]) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1bda771a072a..63ff71830ea4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1713,10 +1713,9 @@ EXPORT_SYMBOL(free_pages); static unsigned int nr_free_zone_pages(int offset) { /* Just pick one node, since fallback list is circular */ - pg_data_t *pgdat = NODE_DATA(numa_node_id()); unsigned int sum = 0; - struct zonelist *zonelist = pgdat->node_zonelists + offset; + struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); struct zone **zonep = zonelist->zones; struct zone *zone; diff --git a/mm/slab.c b/mm/slab.c index 03927cb5ec9e..5488c54b1172 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3249,8 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; - zonelist = &NODE_DATA(slab_node(current->mempolicy)) - ->node_zonelists[gfp_zone(flags)]; + zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry: diff --git a/mm/slub.c b/mm/slub.c index 39592b5ce68a..19ebbfb20689 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1309,8 +1309,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - zonelist = &NODE_DATA( - slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)]; + zonelist = node_zonelist(slab_node(current->mempolicy), flags); for (z = zonelist->zones; *z; z++) { struct kmem_cache_node *n; -- cgit v1.2.3 From 18ea7e710d2452fa726814a406779188028cf1bf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:14 -0700 Subject: mm: remember what the preferred zone is for zone_statistics On NUMA, zone_statistics() is used to record events like numa hit, miss and foreign. It assumes that the first zone in a zonelist is the preferred zone. When multiple zonelists are replaced by one that is filtered, this is no longer the case. This patch records what the preferred zone is rather than assuming the first zone in the zonelist is it. 
This simplifies the reading of later patches in this set. Signed-off-by: Mel Gorman Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Reviewed-by: Christoph Lameter Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++---- mm/vmstat.c | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 63ff71830ea4..187efd47a446 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1050,7 +1050,7 @@ void split_page(struct page *page, unsigned int order) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page *buffered_rmqueue(struct zonelist *zonelist, +static struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; @@ -1102,7 +1102,7 @@ again: } __count_zone_vm_events(PGALLOC, zone, 1 << order); - zone_statistics(zonelist, zone); + zone_statistics(preferred_zone, zone); local_irq_restore(flags); put_cpu(); @@ -1383,7 +1383,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zone **z; struct page *page = NULL; int classzone_idx = zone_idx(zonelist->zones[0]); - struct zone *zone; + struct zone *zone, *preferred_zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ @@ -1395,6 +1395,7 @@ zonelist_scan: * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ z = zonelist->zones; + preferred_zone = *z; do { /* @@ -1433,7 +1434,7 @@ zonelist_scan: } } - page = buffered_rmqueue(zonelist, zone, order, gfp_mask); + page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); if (page) break; this_zone_full: diff --git a/mm/vmstat.c b/mm/vmstat.c index 7c7286e9506d..879bcc0a1d4c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -364,13 +364,13 @@ void refresh_cpu_vm_stats(int cpu) * * Must be called with interrupts disabled. */ -void zone_statistics(struct zonelist *zonelist, struct zone *z) +void zone_statistics(struct zone *preferred_zone, struct zone *z) { - if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { + if (z->zone_pgdat == preferred_zone->zone_pgdat) { __inc_zone_state(z, NUMA_HIT); } else { __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); } if (z->node == numa_node_id()) __inc_zone_state(z, NUMA_LOCAL); -- cgit v1.2.3 From 54a6eb5c4765aa573a030ceeba2c14e3d2ea5706 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:16 -0700 Subject: mm: use two zonelist that are filtered by GFP mask Currently a node has two sets of zonelists, one for each zone type in the system and a second set for GFP_THISNODE allocations. Based on the zones allowed by a gfp mask, one of these zonelists is selected. All of these zonelists consume memory and occupy cache lines. This patch replaces the multiple zonelists per-node with two zonelists. The first contains all populated zones in the system, ordered by distance, for fallback allocations when the target/preferred node has no free pages. The second contains all populated zones in the node suitable for GFP_THISNODE allocations. An iterator macro is introduced called for_each_zone_zonelist() that interates through each zone allowed by the GFP flags in the selected zonelist. 
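The iterator macro is defined in the headers rather than in this mm-only diff; as used in the hunks below it is expected to behave like the open-coded loop sketched here (illustrative only, walking the NULL-terminated zone array with high_zoneidx = gfp_zone(gfp_mask)):

        struct zone **z;
        struct zone *zone;

        for (z = zonelist->zones; (zone = *z) != NULL; z++) {
                if (zone_idx(zone) > high_zoneidx)
                        continue;       /* zone type not allowed by the GFP mask */
                /* ... use zone ... */
        }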
Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Christoph Lameter Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 8 +-- mm/oom_kill.c | 8 ++- mm/page_alloc.c | 170 ++++++++++++++++++++++++-------------------------------- mm/slab.c | 8 ++- mm/slub.c | 8 ++- mm/vmscan.c | 21 +++---- 6 files changed, 101 insertions(+), 122 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51c9e2c01640..ddd141cad77f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -97,11 +97,11 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, struct mempolicy *mpol; struct zonelist *zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol); - struct zone **z; + struct zone *zone, **z; - for (z = zonelist->zones; *z; z++) { - nid = zone_to_nid(*z); - if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && + for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) { + nid = zone_to_nid(zone); + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && !list_empty(&hugepage_freelists[nid])) { page = list_entry(hugepage_freelists[nid].next, struct page, lru); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index beb592fe9389..2c93502cfcb4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -175,12 +175,14 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) { #ifdef CONFIG_NUMA + struct zone *zone; struct zone **z; + enum zone_type high_zoneidx = gfp_zone(gfp_mask); nodemask_t nodes = node_states[N_HIGH_MEMORY]; - for (z = zonelist->zones; *z; z++) - if (cpuset_zone_allowed_softwall(*z, gfp_mask)) - node_clear(zone_to_nid(*z), nodes); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + if (cpuset_zone_allowed_softwall(zone, gfp_mask)) + node_clear(zone_to_nid(zone), nodes); else return CONSTRAINT_CPUSET; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 187efd47a446..4ccb8651cf22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1378,42 +1378,29 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) */ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, int alloc_flags) + struct zonelist *zonelist, int high_zoneidx, int alloc_flags) { struct zone **z; struct page *page = NULL; - int classzone_idx = zone_idx(zonelist->zones[0]); + int classzone_idx; struct zone *zone, *preferred_zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ + + z = first_zones_zonelist(zonelist, high_zoneidx); + classzone_idx = zone_idx(*z); + preferred_zone = *z; zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - z = zonelist->zones; - preferred_zone = *z; - - do { - /* - * In NUMA, this could be a policy zonelist which contains - * zones that may not be allowed by the current gfp_mask. 
- * Check the zone is allowed by the current flags - */ - if (unlikely(alloc_should_filter_zonelist(zonelist))) { - if (highest_zoneidx == -1) - highest_zoneidx = gfp_zone(gfp_mask); - if (zone_idx(*z) > highest_zoneidx) - continue; - } - + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - zone = *z; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; @@ -1447,7 +1434,7 @@ try_next_zone: zlc_active = 1; did_zlc_setup = 1; } - } while (*(++z) != NULL); + } if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { /* Disable zlc cache for second zonelist scan */ @@ -1465,6 +1452,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { const gfp_t wait = gfp_mask & __GFP_WAIT; + enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone **z; struct page *page; struct reclaim_state reclaim_state; @@ -1490,7 +1478,7 @@ restart: } page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -1534,7 +1522,8 @@ restart: * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); + page = get_page_from_freelist(gfp_mask, order, zonelist, + high_zoneidx, alloc_flags); if (page) goto got_pg; @@ -1547,7 +1536,7 @@ rebalance: nofail_alloc: /* go through the zonelist yet again, ignoring mins */ page = get_page_from_freelist(gfp_mask, order, - zonelist, ALLOC_NO_WATERMARKS); + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { @@ -1582,7 +1571,7 @@ nofail_alloc: if (likely(did_some_progress)) { page = get_page_from_freelist(gfp_mask, order, - zonelist, alloc_flags); + zonelist, high_zoneidx, alloc_flags); if (page) goto got_pg; } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { @@ -1598,7 +1587,7 @@ nofail_alloc: * under heavy pressure. 
*/ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) { clear_zonelist_oom(zonelist); goto got_pg; @@ -1713,14 +1702,15 @@ EXPORT_SYMBOL(free_pages); static unsigned int nr_free_zone_pages(int offset) { + struct zone **z; + struct zone *zone; + /* Just pick one node, since fallback list is circular */ unsigned int sum = 0; struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); - struct zone **zonep = zonelist->zones; - struct zone *zone; - for (zone = *zonep++; zone; zone = *zonep++) { + for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; unsigned long high = zone->pages_high; if (size > high) @@ -2078,17 +2068,15 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) */ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) { - enum zone_type i; int j; struct zonelist *zonelist; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + i; - for (j = 0; zonelist->zones[j] != NULL; j++) - ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); - zonelist->zones[j] = NULL; - } + zonelist = &pgdat->node_zonelists[0]; + for (j = 0; zonelist->zones[j] != NULL; j++) + ; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); + zonelist->zones[j] = NULL; } /* @@ -2096,15 +2084,12 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) */ static void build_thisnode_zonelists(pg_data_t *pgdat) { - enum zone_type i; int j; struct zonelist *zonelist; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; - j = build_zonelists_node(pgdat, zonelist, 0, i); - zonelist->zones[j] = NULL; - } + zonelist = &pgdat->node_zonelists[1]; + j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + zonelist->zones[j] = NULL; } /* @@ -2117,27 +2102,24 @@ static int node_order[MAX_NUMNODES]; static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) { - enum zone_type i; int pos, j, node; int zone_type; /* needs to be signed */ struct zone *z; struct zonelist *zonelist; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + i; - pos = 0; - for (zone_type = i; zone_type >= 0; zone_type--) { - for (j = 0; j < nr_nodes; j++) { - node = node_order[j]; - z = &NODE_DATA(node)->node_zones[zone_type]; - if (populated_zone(z)) { - zonelist->zones[pos++] = z; - check_highest_zone(zone_type); - } + zonelist = &pgdat->node_zonelists[0]; + pos = 0; + for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { + for (j = 0; j < nr_nodes; j++) { + node = node_order[j]; + z = &NODE_DATA(node)->node_zones[zone_type]; + if (populated_zone(z)) { + zonelist->zones[pos++] = z; + check_highest_zone(zone_type); } } - zonelist->zones[pos] = NULL; } + zonelist->zones[pos] = NULL; } static int default_zonelist_order(void) @@ -2264,19 +2246,15 @@ static void build_zonelists(pg_data_t *pgdat) /* Construct the zonelist performance cache - see further mmzone.h */ static void build_zonelist_cache(pg_data_t *pgdat) { - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zonelist *zonelist; - struct zonelist_cache *zlc; - struct zone **z; + struct zonelist *zonelist; + struct zonelist_cache *zlc; + struct zone **z; - zonelist = pgdat->node_zonelists + i; - zonelist->zlcache_ptr = zlc = &zonelist->zlcache; - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = 
zonelist->zones; *z; z++) - zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); - } + zonelist = &pgdat->node_zonelists[0]; + zonelist->zlcache_ptr = zlc = &zonelist->zlcache; + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + for (z = zonelist->zones; *z; z++) + zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); } @@ -2290,45 +2268,43 @@ static void set_zonelist_order(void) static void build_zonelists(pg_data_t *pgdat) { int node, local_node; - enum zone_type i,j; + enum zone_type j; + struct zonelist *zonelist; local_node = pgdat->node_id; - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zonelist *zonelist; - zonelist = pgdat->node_zonelists + i; + zonelist = &pgdat->node_zonelists[0]; + j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); - j = build_zonelists_node(pgdat, zonelist, 0, i); - /* - * Now we build the zonelist so that it contains the zones - * of all the other nodes. - * We don't want to pressure a particular node, so when - * building the zones for node N, we make sure that the - * zones coming right after the local ones are those from - * node N+1 (modulo N) - */ - for (node = local_node + 1; node < MAX_NUMNODES; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); - } - for (node = 0; node < local_node; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); - } - - zonelist->zones[j] = NULL; + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < MAX_NUMNODES; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); } + for (node = 0; node < local_node; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); + } + + zonelist->zones[j] = NULL; } /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ static void build_zonelist_cache(pg_data_t *pgdat) { - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) - pgdat->node_zonelists[i].zlcache_ptr = NULL; + pgdat->node_zonelists[0].zlcache_ptr = NULL; + pgdat->node_zonelists[1].zlcache_ptr = NULL; } #endif /* CONFIG_NUMA */ diff --git a/mm/slab.c b/mm/slab.c index 5488c54b1172..29851841da62 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3243,6 +3243,8 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zonelist *zonelist; gfp_t local_flags; struct zone **z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; int nid; @@ -3257,10 +3259,10 @@ retry: * Look through allowed nodes for objects available * from existing per node queues. 
*/ - for (z = zonelist->zones; *z && !obj; z++) { - nid = zone_to_nid(*z); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + nid = zone_to_nid(zone); - if (cpuset_zone_allowed_hardwall(*z, flags) && + if (cpuset_zone_allowed_hardwall(zone, flags) && cache->nodelists[nid] && cache->nodelists[nid]->free_objects) obj = ____cache_alloc_node(cache, diff --git a/mm/slub.c b/mm/slub.c index 19ebbfb20689..80d20cc1c0f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1285,6 +1285,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zone **z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); struct page *page; /* @@ -1310,12 +1312,12 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) return NULL; zonelist = node_zonelist(slab_node(current->mempolicy), flags); - for (z = zonelist->zones; *z; z++) { + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; - n = get_node(s, zone_to_nid(*z)); + n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed_hardwall(*z, flags) && + if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > MIN_PARTIAL) { page = get_partial_node(n); if (page) diff --git a/mm/vmscan.c b/mm/vmscan.c index ef8551e0d2d0..0515b8f44894 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1249,15 +1249,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone, static unsigned long shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { + enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long nr_reclaimed = 0; - struct zone **zones = zonelist->zones; - int i; - + struct zone **z; + struct zone *zone; sc->all_unreclaimable = 1; - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; - + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!populated_zone(zone)) continue; /* @@ -1311,8 +1309,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; - struct zone **zones = zonelist->zones; - int i; + struct zone **z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(gfp_mask); if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); @@ -1320,8 +1319,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * mem_cgroup will not do shrink_slab. */ if (scan_global_lru(sc)) { - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; @@ -1385,8 +1383,7 @@ out: priority = 0; if (scan_global_lru(sc)) { - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; -- cgit v1.2.3 From dd1a239f6f2d4d3eedd318583ec319aa145b324c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:17 -0700 Subject: mm: have zonelist contains structs with both a zone pointer and zone_idx Filtering zonelists requires very frequent use of zone_idx(). This is costly as it involves a lookup of another structure and a substraction operation. As the zone_idx is often required, it should be quickly accessible. 
The node idx could also be stored here if it was found that accessing zone->node is significant which may be the case on workloads where nodemasks are heavily used. This patch introduces a struct zoneref to store a zone pointer and a zone index. The zonelist then consists of an array of these struct zonerefs which are looked up as necessary. Helpers are given for accessing the zone index as well as the node index. [kamezawa.hiroyu@jp.fujitsu.com: Suggested struct zoneref instead of embedding information in pointers] [hugh@veritas.com: mm-have-zonelist: fix memcg ooms] [hugh@veritas.com: just return do_try_to_free_pages] [hugh@veritas.com: do_try_to_free_pages gfp_mask redundant] Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Christoph Lameter Cc: Nick Piggin Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- mm/mempolicy.c | 36 ++++++++++++++++++------------ mm/oom_kill.c | 45 +++++++++++++++++++------------------- mm/page_alloc.c | 68 ++++++++++++++++++++++++++++++++------------------------- mm/slab.c | 2 +- mm/slub.c | 2 +- mm/vmscan.c | 22 +++++++++---------- 7 files changed, 96 insertions(+), 82 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ddd141cad77f..4bced0d705ca 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -97,7 +97,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, struct mempolicy *mpol; struct zonelist *zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol); - struct zone *zone, **z; + struct zone *zone; + struct zoneref *z; for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) { nid = zone_to_nid(zone); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5d20bf44062f..90193a2a915b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -186,7 +186,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) for_each_node_mask(nd, *nodes) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; if (z->present_pages > 0) - zl->zones[num++] = z; + zoneref_set_zone(z, &zl->_zonerefs[num++]); } if (k == 0) break; @@ -196,7 +196,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) kfree(zl); return ERR_PTR(-EINVAL); } - zl->zones[num] = NULL; + zl->_zonerefs[num].zone = NULL; + zl->_zonerefs[num].zone_idx = 0; return zl; } @@ -504,9 +505,11 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) nodes_clear(*nodes); switch (p->policy) { case MPOL_BIND: - for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(zone_to_nid(p->v.zonelist->zones[i]), - *nodes); + for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) { + struct zoneref *zref; + zref = &p->v.zonelist->_zonerefs[i]; + node_set(zonelist_node_idx(zref), *nodes); + } break; case MPOL_DEFAULT: break; @@ -1212,12 +1215,13 @@ unsigned slab_node(struct mempolicy *policy) case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: + case MPOL_BIND: { /* * Follow bind policy behavior and start allocation at the * first node. 
*/ - return zone_to_nid(policy->v.zonelist->zones[0]); + return zonelist_node_idx(policy->v.zonelist->_zonerefs); + } case MPOL_PREFERRED: if (policy->v.preferred_node >= 0) @@ -1323,7 +1327,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, zl = node_zonelist(nid, gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) + if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1463,10 +1467,14 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) return a->v.preferred_node == b->v.preferred_node; case MPOL_BIND: { int i; - for (i = 0; a->v.zonelist->zones[i]; i++) - if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) + for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) { + struct zone *za, *zb; + za = zonelist_zone(&a->v.zonelist->_zonerefs[i]); + zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]); + if (za != zb) return 0; - return b->v.zonelist->zones[i] == NULL; + } + return b->v.zonelist->_zonerefs[i].zone == NULL; } default: BUG(); @@ -1785,12 +1793,12 @@ static void mpol_rebind_policy(struct mempolicy *pol, break; case MPOL_BIND: { nodemask_t nodes; - struct zone **z; + struct zoneref *z; struct zonelist *zonelist; nodes_clear(nodes); - for (z = pol->v.zonelist->zones; *z; z++) - node_set(zone_to_nid(*z), nodes); + for (z = pol->v.zonelist->_zonerefs; z->zone; z++) + node_set(zonelist_node_idx(z), nodes); nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = tmp; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2c93502cfcb4..e41504aa5da9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -176,7 +176,7 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, { #ifdef CONFIG_NUMA struct zone *zone; - struct zone **z; + struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); nodemask_t nodes = node_states[N_HIGH_MEMORY]; @@ -462,29 +462,29 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zone_oom(struct zonelist *zonelist) +int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) { - struct zone **z; + struct zoneref *z; + struct zone *zone; int ret = 1; - z = zonelist->zones; - spin_lock(&zone_scan_mutex); - do { - if (zone_is_oom_locked(*z)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + if (zone_is_oom_locked(zone)) { ret = 0; goto out; } - } while (*(++z) != NULL); + } + + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + /* + * Lock each zone in the zonelist under zone_scan_mutex so a + * parallel invocation of try_set_zone_oom() doesn't succeed + * when it shouldn't. + */ + zone_set_flag(zone, ZONE_OOM_LOCKED); + } - /* - * Lock each zone in the zonelist under zone_scan_mutex so a parallel - * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. - */ - z = zonelist->zones; - do { - zone_set_flag(*z, ZONE_OOM_LOCKED); - } while (*(++z) != NULL); out: spin_unlock(&zone_scan_mutex); return ret; @@ -495,16 +495,15 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. 
*/ -void clear_zonelist_oom(struct zonelist *zonelist) +void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) { - struct zone **z; - - z = zonelist->zones; + struct zoneref *z; + struct zone *zone; spin_lock(&zone_scan_mutex); - do { - zone_clear_flag(*z, ZONE_OOM_LOCKED); - } while (*(++z) != NULL); + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + zone_clear_flag(zone, ZONE_OOM_LOCKED); + } spin_unlock(&zone_scan_mutex); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ccb8651cf22..6d94d04ea784 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1317,7 +1317,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) * We are low on memory in the second scan, and should leave no stone * unturned looking for a free page. */ -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, nodemask_t *allowednodes) { struct zonelist_cache *zlc; /* cached zonelist speedup info */ @@ -1328,7 +1328,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, if (!zlc) return 1; - i = z - zonelist->zones; + i = z - zonelist->_zonerefs; n = zlc->z_to_n[i]; /* This zone is worth trying if it is allowed but not full */ @@ -1340,7 +1340,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, * zlc->fullzones, so that subsequent attempts to allocate a page * from that zone don't waste time re-examining it. */ -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { struct zonelist_cache *zlc; /* cached zonelist speedup info */ int i; /* index of *z in zonelist zones */ @@ -1349,7 +1349,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) if (!zlc) return; - i = z - zonelist->zones; + i = z - zonelist->_zonerefs; set_bit(i, zlc->fullzones); } @@ -1361,13 +1361,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) return NULL; } -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, nodemask_t *allowednodes) { return 1; } -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { } #endif /* CONFIG_NUMA */ @@ -1380,7 +1380,7 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags) { - struct zone **z; + struct zoneref *z; struct page *page = NULL; int classzone_idx; struct zone *zone, *preferred_zone; @@ -1389,8 +1389,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ z = first_zones_zonelist(zonelist, high_zoneidx); - classzone_idx = zone_idx(*z); - preferred_zone = *z; + classzone_idx = zonelist_zone_idx(z); + preferred_zone = zonelist_zone(z); zonelist_scan: /* @@ -1453,7 +1453,8 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, { const gfp_t wait = gfp_mask & __GFP_WAIT; enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zone **z; + struct zoneref *z; + struct zone *zone; struct page *page; struct reclaim_state reclaim_state; struct task_struct *p = current; @@ -1467,9 +1468,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, return NULL; restart: - z = zonelist->zones; /* the list of 
zones suitable for gfp_mask */ + z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ - if (unlikely(*z == NULL)) { + if (unlikely(!z->zone)) { /* * Happens if we have an empty zonelist as a result of * GFP_THISNODE being used on a memoryless node @@ -1493,8 +1494,8 @@ restart: if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; - for (z = zonelist->zones; *z; z++) - wakeup_kswapd(*z, order); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); /* * OK, we're below the kswapd watermark and have kicked background @@ -1575,7 +1576,7 @@ nofail_alloc: if (page) goto got_pg; } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - if (!try_set_zone_oom(zonelist)) { + if (!try_set_zone_oom(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); goto restart; } @@ -1589,18 +1590,18 @@ nofail_alloc: page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) { - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto got_pg; } /* The OOM killer will not help higher order allocs so fail */ if (order > PAGE_ALLOC_COSTLY_ORDER) { - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto nopage; } out_of_memory(zonelist, gfp_mask, order); - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto restart; } @@ -1702,7 +1703,7 @@ EXPORT_SYMBOL(free_pages); static unsigned int nr_free_zone_pages(int offset) { - struct zone **z; + struct zoneref *z; struct zone *zone; /* Just pick one node, since fallback list is circular */ @@ -1896,7 +1897,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, zone_type--; zone = pgdat->node_zones + zone_type; if (populated_zone(zone)) { - zonelist->zones[nr_zones++] = zone; + zoneref_set_zone(zone, + &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } @@ -2072,11 +2074,12 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[0]; - for (j = 0; zonelist->zones[j] != NULL; j++) + for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j, MAX_NR_ZONES - 1); - zonelist->zones[j] = NULL; + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* @@ -2089,7 +2092,8 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) zonelist = &pgdat->node_zonelists[1]; j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); - zonelist->zones[j] = NULL; + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* @@ -2114,12 +2118,14 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) node = node_order[j]; z = &NODE_DATA(node)->node_zones[zone_type]; if (populated_zone(z)) { - zonelist->zones[pos++] = z; + zoneref_set_zone(z, + &zonelist->_zonerefs[pos++]); check_highest_zone(zone_type); } } } - zonelist->zones[pos] = NULL; + zonelist->_zonerefs[pos].zone = NULL; + zonelist->_zonerefs[pos].zone_idx = 0; } static int default_zonelist_order(void) @@ -2196,7 +2202,8 @@ static void build_zonelists(pg_data_t *pgdat) /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { zonelist = pgdat->node_zonelists + i; - zonelist->zones[0] = NULL; + zonelist->_zonerefs[0].zone = NULL; + zonelist->_zonerefs[0].zone_idx = 0; } /* NUMA-aware ordering of nodes */ @@ -2248,13 +2255,13 @@ static void build_zonelist_cache(pg_data_t 
*pgdat) { struct zonelist *zonelist; struct zonelist_cache *zlc; - struct zone **z; + struct zoneref *z; zonelist = &pgdat->node_zonelists[0]; zonelist->zlcache_ptr = zlc = &zonelist->zlcache; bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = zonelist->zones; *z; z++) - zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); + for (z = zonelist->_zonerefs; z->zone; z++) + zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); } @@ -2297,7 +2304,8 @@ static void build_zonelists(pg_data_t *pgdat) MAX_NR_ZONES - 1); } - zonelist->zones[j] = NULL; + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ diff --git a/mm/slab.c b/mm/slab.c index 29851841da62..7bc4a136846e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3242,7 +3242,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { struct zonelist *zonelist; gfp_t local_flags; - struct zone **z; + struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; diff --git a/mm/slub.c b/mm/slub.c index 80d20cc1c0f8..48fff83a1e9d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1284,7 +1284,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) { #ifdef CONFIG_NUMA struct zonelist *zonelist; - struct zone **z; + struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); struct page *page; diff --git a/mm/vmscan.c b/mm/vmscan.c index 0515b8f44894..eceac9f9032f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1251,7 +1251,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, { enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long nr_reclaimed = 0; - struct zone **z; + struct zoneref *z; struct zone *zone; sc->all_unreclaimable = 1; @@ -1301,7 +1301,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, * allocation attempt will fail. 
*/ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - gfp_t gfp_mask, struct scan_control *sc) + struct scan_control *sc) { int priority; int ret = 0; @@ -1309,9 +1309,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; - struct zone **z; + struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); + enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); @@ -1339,7 +1339,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * over limit cgroups */ if (scan_global_lru(sc)) { - shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -1410,7 +1410,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .isolate_pages = isolate_pages_global, }; - return do_try_to_free_pages(zonelist, gfp_mask, &sc); + return do_try_to_free_pages(zonelist, &sc); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR @@ -1419,7 +1419,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask) { struct scan_control sc = { - .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, @@ -1429,12 +1428,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .isolate_pages = mem_cgroup_isolate_pages, }; struct zonelist *zonelist; - int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[target_zone]; - if (do_try_to_free_pages(zonelist, sc.gfp_mask, &sc)) - return 1; - return 0; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + zonelist = NODE_DATA(numa_node_id())->node_zonelists; + return do_try_to_free_pages(zonelist, &sc); } #endif -- cgit v1.2.3 From 19770b32609b6bf97a3dece2529089494cbfc549 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:18 -0700 Subject: mm: filter based on a nodemask as well as a gfp_mask The MPOL_BIND policy creates a zonelist that is used for allocations controlled by that mempolicy. As the per-node zonelist is already being filtered based on a zone id, this patch adds a version of __alloc_pages() that takes a nodemask for further filtering. This eliminates the need for MPOL_BIND to create a custom zonelist. A positive benefit of this is that allocations using MPOL_BIND now use the local node's distance-ordered zonelist instead of a custom node-id-ordered zonelist. I.e., pages will be allocated from the closest allowed node with available memory. 
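Roughly, an MPOL_BIND allocation after this patch follows the sketch below (names taken from the hunks that follow; simplified, not a verbatim excerpt):

        struct mempolicy *pol = get_vma_policy(current, vma, addr);
        /* local node's distance-ordered zonelist for MPOL_BIND */
        struct zonelist *zl = zonelist_policy(gfp, pol);
        /* nodemask_policy() yields &pol->v.nodes for MPOL_BIND, NULL otherwise */
        struct page *page = __alloc_pages_nodemask(gfp, 0, zl,
                                        nodemask_policy(gfp, pol));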
[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments] [Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask] [Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework] Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +- mm/mempolicy.c | 184 ++++++++++++++++++++++---------------------------------- mm/mmzone.c | 30 +++++++++ mm/page_alloc.c | 50 ++++++++++----- 4 files changed, 143 insertions(+), 127 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4bced0d705ca..3737d82f5225 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, int nid; struct page *page = NULL; struct mempolicy *mpol; + nodemask_t *nodemask; struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol); + htlb_alloc_mask, &mpol, &nodemask); struct zone *zone; struct zoneref *z; - for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && !list_empty(&hugepage_freelists[nid])) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 90193a2a915b..acb5ee3587c3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) return 0; } -/* Generate a custom zonelist for the BIND policy. */ -static struct zonelist *bind_zonelist(nodemask_t *nodes) +/* Check that the nodemask contains at least one populated zone */ +static int is_valid_nodemask(nodemask_t *nodemask) { - struct zonelist *zl; - int num, max, nd; - enum zone_type k; + int nd, k; - max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); - max++; /* space for zlcache_ptr (see mmzone.h) */ - zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); - if (!zl) - return ERR_PTR(-ENOMEM); - zl->zlcache_ptr = NULL; - num = 0; - /* First put in the highest zones from all nodes, then all the next - lower zones etc. Avoid empty zones because the memory allocator - doesn't like them. If you implement node hot removal you - have to fix that. 
*/ - k = MAX_NR_ZONES - 1; - while (1) { - for_each_node_mask(nd, *nodes) { - struct zone *z = &NODE_DATA(nd)->node_zones[k]; - if (z->present_pages > 0) - zoneref_set_zone(z, &zl->_zonerefs[num++]); + /* Check that there is something useful in this mask */ + k = policy_zone; + + for_each_node_mask(nd, *nodemask) { + struct zone *z; + + for (k = 0; k <= policy_zone; k++) { + z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + return 1; } - if (k == 0) - break; - k--; - } - if (num == 0) { - kfree(zl); - return ERR_PTR(-EINVAL); } - zl->_zonerefs[num].zone = NULL; - zl->_zonerefs[num].zone_idx = 0; - return zl; + + return 0; } /* Create a new policy */ @@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) policy->v.preferred_node = -1; break; case MPOL_BIND: - policy->v.zonelist = bind_zonelist(nodes); - if (IS_ERR(policy->v.zonelist)) { - void *error_code = policy->v.zonelist; + if (!is_valid_nodemask(nodes)) { kmem_cache_free(policy_cache, policy); - return error_code; + return ERR_PTR(-EINVAL); } + policy->v.nodes = *nodes; break; } policy->policy = mode; @@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes) /* Fill a zone bitmap for a policy */ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) { - int i; - nodes_clear(*nodes); switch (p->policy) { - case MPOL_BIND: - for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) { - struct zoneref *zref; - zref = &p->v.zonelist->_zonerefs[i]; - node_set(zonelist_node_idx(zref), *nodes); - } - break; case MPOL_DEFAULT: break; + case MPOL_BIND: + /* Fall through */ case MPOL_INTERLEAVE: *nodes = p->v.nodes; break; @@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task, return pol; } +/* Return a nodemask representing a mempolicy */ +static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) +{ + /* Lower zones don't get a nodemask applied for MPOL_BIND */ + if (unlikely(policy->policy == MPOL_BIND) && + gfp_zone(gfp) >= policy_zone && + cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) + return &policy->v.nodes; + + return NULL; +} + /* Return a zonelist representing a mempolicy */ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) { @@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) nd = numa_node_id(); break; case MPOL_BIND: - /* Lower zones don't get a policy applied */ - /* Careful: current->mems_allowed might have moved */ - if (gfp_zone(gfp) >= policy_zone) - if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) - return policy->v.zonelist; - /*FALL THROUGH*/ + /* + * Normally, MPOL_BIND allocations node-local are node-local + * within the allowed nodemask. However, if __GFP_THISNODE is + * set and the current node is part of the mask, we use the + * the zonelist for the first node in the mask instead. + */ + nd = numa_node_id(); + if (unlikely(gfp & __GFP_THISNODE) && + unlikely(!node_isset(nd, policy->v.nodes))) + nd = first_node(policy->v.nodes); + break; case MPOL_INTERLEAVE: /* should not happen */ case MPOL_DEFAULT: nd = numa_node_id(); @@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy) * Follow bind policy behavior and start allocation at the * first node. 
*/ - return zonelist_node_idx(policy->v.zonelist->_zonerefs); + struct zonelist *zonelist; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); + zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + (void)first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes, + &zone); + return zone->node; } case MPOL_PREFERRED: @@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @vma = virtual memory area whose policy is sought * @addr = address in @vma for shared policy lookup and interleave policy * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy + * @mpol = pointer to mempolicy pointer for reference counted mempolicy + * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a zonelist suitable for a huge page allocation. - * If the effective policy is 'BIND, returns pointer to policy's zonelist. + * If the effective policy is 'BIND, returns pointer to local node's zonelist, + * and a pointer to the mempolicy's @nodemask for filtering the zonelist. * If it is also a policy for which get_vma_policy() returns an extra - * reference, we must hold that reference until after allocation. + * reference, we must hold that reference until after the allocation. * In that case, return policy via @mpol so hugetlb allocation can drop - * the reference. For non-'BIND referenced policies, we can/do drop the + * the reference. For non-'BIND referenced policies, we can/do drop the * reference here, so the caller doesn't need to know about the special case * for default and current task policy. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol) + gfp_t gfp_flags, struct mempolicy **mpol, + nodemask_t **nodemask) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; *mpol = NULL; /* probably no unref needed */ - if (pol->policy == MPOL_INTERLEAVE) { + *nodemask = NULL; /* assume !MPOL_BIND */ + if (pol->policy == MPOL_BIND) { + *nodemask = &pol->v.nodes; + } else if (pol->policy == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); @@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted policy -- shared or vma */ - struct page *page = __alloc_pages(gfp, 0, zl); + struct page *page = __alloc_pages_nodemask(gfp, 0, + zl, nodemask_policy(gfp, pol)); __mpol_free(pol); return page; } /* * fast path: default or task policy */ - return __alloc_pages(gfp, 0, zl); + return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol)); } /** @@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); + return __alloc_pages_nodemask(gfp, order, + zonelist_policy(gfp, pol), nodemask_policy(gfp, pol)); } EXPORT_SYMBOL(alloc_pages_current); @@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) } *new = *old; atomic_set(&new->refcnt, 1); - if (new->policy == MPOL_BIND) { - int sz = ksize(old->v.zonelist); - new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); - if (!new->v.zonelist) { - kmem_cache_free(policy_cache, new); - return ERR_PTR(-ENOMEM); - } - } return new; } @@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct 
mempolicy *b) switch (a->policy) { case MPOL_DEFAULT: return 1; + case MPOL_BIND: + /* Fall through */ case MPOL_INTERLEAVE: return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; - case MPOL_BIND: { - int i; - for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) { - struct zone *za, *zb; - za = zonelist_zone(&a->v.zonelist->_zonerefs[i]); - zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]); - if (za != zb) - return 0; - } - return b->v.zonelist->_zonerefs[i].zone == NULL; - } default: BUG(); return 0; @@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; - if (p->policy == MPOL_BIND) - kfree(p->v.zonelist); p->policy = MPOL_DEFAULT; kmem_cache_free(policy_cache, p); } @@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, switch (pol->policy) { case MPOL_DEFAULT: break; + case MPOL_BIND: + /* Fall through */ case MPOL_INTERLEAVE: nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; @@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, *mpolmask, *newmask); *mpolmask = *newmask; break; - case MPOL_BIND: { - nodemask_t nodes; - struct zoneref *z; - struct zonelist *zonelist; - - nodes_clear(nodes); - for (z = pol->v.zonelist->_zonerefs; z->zone; z++) - node_set(zonelist_node_idx(z), nodes); - nodes_remap(tmp, nodes, *mpolmask, *newmask); - nodes = tmp; - - zonelist = bind_zonelist(&nodes); - - /* If no mem, then zonelist is NULL and we keep old zonelist. - * If that old zonelist has no remaining mems_allowed nodes, - * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. - */ - - if (!IS_ERR(zonelist)) { - /* Good - got mem - substitute new zonelist */ - kfree(pol->v.zonelist); - pol->v.zonelist = zonelist; - } - *mpolmask = *newmask; - break; - } default: BUG(); break; @@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) break; case MPOL_BIND: - get_zonemask(pol, &nodes); - break; - + /* Fall through */ case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; diff --git a/mm/mmzone.c b/mm/mmzone.c index eb5838634f18..486ed595ee6f 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone) return zone; } +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) +{ +#ifdef CONFIG_NUMA + return node_isset(zonelist_node_idx(zref), *nodes); +#else + return 1; +#endif /* CONFIG_NUMA */ +} + +/* Returns the next zone at or below highest_zoneidx in a zonelist */ +struct zoneref *next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes, + struct zone **zone) +{ + /* + * Find the next suitable zone to use for the allocation. + * Only filter based on nodemask if it's set + */ + if (likely(nodes == NULL)) + while (zonelist_zone_idx(z) > highest_zoneidx) + z++; + else + while (zonelist_zone_idx(z) > highest_zoneidx || + (z->zone && !zref_in_nodemask(z, nodes))) + z++; + + *zone = zonelist_zone(z++); + return z; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d94d04ea784..b4beb3eea8b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1377,7 +1377,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) * a page. 
*/ static struct page * -get_page_from_freelist(gfp_t gfp_mask, unsigned int order, +get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags) { struct zoneref *z; @@ -1388,16 +1388,17 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - z = first_zones_zonelist(zonelist, high_zoneidx); - classzone_idx = zonelist_zone_idx(z); - preferred_zone = zonelist_zone(z); + (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, + &preferred_zone); + classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) { if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; @@ -1447,9 +1448,9 @@ try_next_zone: /* * This is the 'heart' of the zoned buddy allocator. */ -struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) +static struct page * +__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) { const gfp_t wait = gfp_mask & __GFP_WAIT; enum zone_type high_zoneidx = gfp_zone(gfp_mask); @@ -1478,7 +1479,7 @@ restart: return NULL; } - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -1523,7 +1524,7 @@ restart: * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - page = get_page_from_freelist(gfp_mask, order, zonelist, + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags); if (page) goto got_pg; @@ -1536,7 +1537,7 @@ rebalance: if (!(gfp_mask & __GFP_NOMEMALLOC)) { nofail_alloc: /* go through the zonelist yet again, ignoring mins */ - page = get_page_from_freelist(gfp_mask, order, + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); if (page) goto got_pg; @@ -1571,7 +1572,7 @@ nofail_alloc: drain_all_pages(); if (likely(did_some_progress)) { - page = get_page_from_freelist(gfp_mask, order, + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags); if (page) goto got_pg; @@ -1587,8 +1588,9 @@ nofail_alloc: * a parallel oom killing, we must fail if we're still * under heavy pressure. 
*/ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) { clear_zonelist_oom(zonelist, gfp_mask); goto got_pg; @@ -1637,6 +1639,20 @@ got_pg: return page; } +struct page * +__alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); +} + +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); +} + EXPORT_SYMBOL(__alloc_pages); /* @@ -1880,6 +1896,12 @@ void show_free_areas(void) show_swap_cache_info(); } +static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) +{ + zoneref->zone = zone; + zoneref->zone_idx = zone_idx(zone); +} + /* * Builds allocation fallback zone lists. * -- cgit v1.2.3 From 797df5749032c2286bc7ff3a52de41fde0cdf0a5 Mon Sep 17 00:00:00 2001 From: Chris Dearman Date: Mon, 28 Apr 2008 02:12:19 -0700 Subject: mm: try both endianess when checking for endianess When checking for the swap header try byteswapping the endianess dependent fields to allow the swap partition to be shared between big & little endian systems. Signed-off-by: Chris Dearman Signed-off-by: Ralf Baechle Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 2da149cfc9ac..67051be7083a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1582,6 +1582,14 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = -EINVAL; goto bad_swap; case 2: + /* swap partition endianess hack... */ + if (swab32(swap_header->info.version) == 1) { + swab32s(&swap_header->info.version); + swab32s(&swap_header->info.last_page); + swab32s(&swap_header->info.nr_badpages); + for (i = 0; i < swap_header->info.nr_badpages; i++) + swab32s(&swap_header->info.badpages[i]); + } /* Check the swap header's sub-version and the size of the swap file and bad block lists */ if (swap_header->info.version != 1) { -- cgit v1.2.3 From 19fc3f0acde32636529969570055c7e2a744787c Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Mon, 28 Apr 2008 02:12:20 -0700 Subject: hugetlb: decrease hugetlb_lock cycling in gather_surplus_huge_pages To reduce hugetlb_lock acquisitions and releases when freeing excess surplus pages, scan the page list in two parts. First, transfer the needed pages to the hugetlb pool. Then drop the lock and free the remaining pages back to the buddy allocator. In the common case there are zero excess pages and no lock operations are required. Thanks Mel Gorman for this improvement. 
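In outline, the reworked free path looks like this (condensed from the hunk below; the empty-list check is omitted):

        /* Phase 1: move the pages still needed into the pool, under the lock */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                if ((--needed) < 0)
                        break;
                list_del(&page->lru);
                enqueue_huge_page(page);
        }

        /* Phase 2: drop the lock once and return the leftovers to the buddy allocator */
        spin_unlock(&hugetlb_lock);
        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                list_del(&page->lru);
                free_huge_page(page);
        }
        spin_lock(&hugetlb_lock);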
Signed-off-by: Adam Litke Cc: Mel Gorman Cc: Dave Hansen Cc: William Lee Irwin III Cc: Andy Whitcroft Cc: Mel Gorman Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3737d82f5225..93ea46a0fba4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -372,11 +372,19 @@ retry: resv_huge_pages += delta; ret = 0; free: + /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + if ((--needed) < 0) + break; list_del(&page->lru); - if ((--needed) >= 0) - enqueue_huge_page(page); - else { + enqueue_huge_page(page); + } + + /* Free unnecessary surplus pages to the buddy allocator */ + if (!list_empty(&surplus_list)) { + spin_unlock(&hugetlb_lock); + list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + list_del(&page->lru); /* * The page has a reference count of zero already, so * call free_huge_page directly instead of using @@ -384,10 +392,9 @@ free: * unlocked which is safe because free_huge_page takes * hugetlb_lock before deciding how to free the page. */ - spin_unlock(&hugetlb_lock); free_huge_page(page); - spin_lock(&hugetlb_lock); } + spin_lock(&hugetlb_lock); } return ret; -- cgit v1.2.3 From 1b27d05b6e21249d2338be26dfcbe8f8d8ff8a5b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 28 Apr 2008 02:12:22 -0700 Subject: mm: move cache_line_size() to Not all architectures define cache_line_size() so as suggested by Andrew move the private implementations in mm/slab.c and mm/slob.c to . Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Reviewed-by: Christoph Lameter Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ---- mm/slub.c | 5 ----- 2 files changed, 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 7bc4a136846e..39d20f8a0791 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -139,10 +139,6 @@ #define BYTES_PER_WORD sizeof(void *) #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - #ifndef ARCH_KMALLOC_MINALIGN /* * Enforce a minimum alignment for the kmalloc caches. diff --git a/mm/slub.c b/mm/slub.c index 48fff83a1e9d..38914bc64aca 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -207,11 +207,6 @@ static inline void ClearSlabDebug(struct page *page) #define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */ #define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */ -/* Not all arches define cache_line_size */ -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - static int kmem_size = sizeof(struct kmem_cache); #ifdef CONFIG_SMP -- cgit v1.2.3 From a3b51e0142d1be156ac697eaadadd6cfbb7ba32b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 28 Apr 2008 02:12:23 -0700 Subject: mempolicy: convert MPOL constants to enum The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, and MPOL_INTERLEAVE, are better declared as part of an enum since they are sequentially numbered and cannot be combined. The policy member of struct mempolicy is also converted from type short to type unsigned short. A negative policy does not have any legitimate meaning, so it is possible to change its type in preparation for adding optional mode flags later. The equivalent member of struct shmem_sb_info is also changed from int to unsigned short. 
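On the header side (not part of this mm-only diff) the result presumably looks roughly like:

        enum {
                MPOL_DEFAULT,
                MPOL_PREFERRED,
                MPOL_BIND,
                MPOL_INTERLEAVE,
                MPOL_MAX,       /* one past the last legal mode */
        };

        struct mempolicy {
                ...
                unsigned short policy;  /* previously: short */
                ...
        };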
For compatibility, the policy formal to get_mempolicy() remains as a pointer to an int: int get_mempolicy(int *policy, unsigned long *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags); although the only possible values is the range of type unsigned short. Cc: Paul Jackson Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 29 +++++++++++++++++------------ mm/shmem.c | 9 +++++---- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index acb5ee3587c3..1311dc4a3888 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -114,7 +114,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask); /* Do sanity checking on a policy */ -static int mpol_check_policy(int mode, nodemask_t *nodes) +static int mpol_check_policy(unsigned short mode, nodemask_t *nodes) { int was_empty, is_empty; @@ -159,6 +159,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) if (!was_empty && is_empty) return -EINVAL; break; + default: + BUG(); } return 0; } @@ -185,7 +187,7 @@ static int is_valid_nodemask(nodemask_t *nodemask) } /* Create a new policy */ -static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) +static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes) { struct mempolicy *policy; @@ -218,6 +220,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) } policy->v.nodes = *nodes; break; + default: + BUG(); } policy->policy = mode; policy->cpuset_mems_allowed = cpuset_mems_allowed(current); @@ -462,7 +466,7 @@ static void mpol_set_task_struct_flag(void) } /* Set the process memory policy */ -static long do_set_mempolicy(int mode, nodemask_t *nodes) +static long do_set_mempolicy(unsigned short mode, nodemask_t *nodes) { struct mempolicy *new; @@ -759,7 +763,7 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * #endif static long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, + unsigned short mode, nodemask_t *nmask, unsigned long flags) { struct vm_area_struct *vma; @@ -769,9 +773,8 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if ((flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - || mode > MPOL_MAX) + if (flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -804,7 +807,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n", start, start + len, mode, nmask ? 
nodes_addr(*nmask)[0] : -1); down_write(&mm->mmap_sem); @@ -905,6 +908,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, nodemask_t nodes; int err; + if (mode >= MPOL_MAX) + return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; @@ -918,7 +923,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, int err; nodemask_t nodes; - if (mode < 0 || mode > MPOL_MAX) + if (mode < 0 || mode >= MPOL_MAX) return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) @@ -1201,7 +1206,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) */ unsigned slab_node(struct mempolicy *policy) { - int pol = policy ? policy->policy : MPOL_DEFAULT; + unsigned short pol = policy ? policy->policy : MPOL_DEFAULT; switch (pol) { case MPOL_INTERLEAVE: @@ -1635,7 +1640,7 @@ restart: return 0; } -void mpol_shared_policy_init(struct shared_policy *info, int policy, +void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy, nodemask_t *policy_nodes) { info->root = RB_ROOT; @@ -1830,7 +1835,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) char *p = buffer; int l; nodemask_t nodes; - int mode = pol ? pol->policy : MPOL_DEFAULT; + unsigned short mode = pol ? pol->policy : MPOL_DEFAULT; switch (mode) { case MPOL_DEFAULT: diff --git a/mm/shmem.c b/mm/shmem.c index f514dd392cd9..d8ef7ba831a5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1079,7 +1079,8 @@ redirty: #ifdef CONFIG_NUMA #ifdef CONFIG_TMPFS -static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +static int shmem_parse_mpol(char *value, unsigned short *policy, + nodemask_t *policy_nodes) { char *nodelist = strchr(value, ':'); int err = 1; @@ -1128,7 +1129,7 @@ out: return err; } -static void shmem_show_mpol(struct seq_file *seq, int policy, +static void shmem_show_mpol(struct seq_file *seq, unsigned short policy, const nodemask_t policy_nodes) { char *policy_string; @@ -1197,13 +1198,13 @@ static struct page *shmem_alloc_page(gfp_t gfp, } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -static inline int shmem_parse_mpol(char *value, int *policy, +static inline int shmem_parse_mpol(char *value, unsigned short *policy, nodemask_t *policy_nodes) { return 1; } -static inline void shmem_show_mpol(struct seq_file *seq, int policy, +static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy, const nodemask_t policy_nodes) { } -- cgit v1.2.3 From 028fec414d803117eb4b2ed12acb4dd5da65b32d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 28 Apr 2008 02:12:25 -0700 Subject: mempolicy: support optional mode flags With the evolution of mempolicies, it is necessary to support mempolicy mode flags that specify how the policy shall behave in certain circumstances. The most immediate need for mode flag support is to suppress remapping the nodemask of a policy at the time of rebind. Both the mempolicy mode and flags are passed by the user in the 'int policy' formal of either the set_mempolicy() or mbind() syscall. A new constant, MPOL_MODE_FLAGS, represents the union of legal optional flags that may be passed as part of this int. Mempolicies that include illegal flags as part of their policy are rejected as invalid. An additional member to struct mempolicy is added to support the mode flags: struct mempolicy { ... unsigned short policy; unsigned short flags; } The splitting of the 'int' actual passed by the user is done in sys_set_mempolicy() and sys_mbind() for their respective syscalls. 
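A sketch of that split, using illustrative constants (the kernel's real MPOL_* values live in its own headers; this only shows the masking being described):

	#define EX_MPOL_MAX		4		/* illustrative */
	#define EX_MPOL_F_EXAMPLE_FLAG	(1 << 15)	/* illustrative flag bit */
	#define EX_MPOL_MODE_FLAGS	EX_MPOL_F_EXAMPLE_FLAG

	static int split_mempolicy_arg(int arg, unsigned short *mode,
				       unsigned short *flags)
	{
		int mode_bits = arg & ~EX_MPOL_MODE_FLAGS;

		if (mode_bits < 0 || mode_bits >= EX_MPOL_MAX)
			return -1;	/* unrecognized mode bits: reject */
		*flags = arg & EX_MPOL_MODE_FLAGS;
		*mode = mode_bits;
		return 0;
	}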
This is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the syscall of there are additional flags, and storing it in the new 'flags' member of struct mempolicy. The intersection of the actual with ~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all current users of pol->policy remain unchanged. The union of the policy mode and optional mode flags is passed back to the user in get_mempolicy(). This combination of mode and flags within the same actual does not break userspace code that relies on get_mempolicy(&policy, ...) and either switch (policy) { case MPOL_BIND: ... case MPOL_INTERLEAVE: ... }; statements or if (policy == MPOL_INTERLEAVE) { ... } statements. Such applications would need to use optional mode flags when calling set_mempolicy() or mbind() for these previously implemented statements to stop working. If an application does start using optional mode flags, it will need to mask the optional flags off the policy in switch and conditional statements that only test mode. An additional member is also added to struct shmem_sb_info to store the optional mode flags. [hugh@veritas.com: shmem mpol: fix build warning] Cc: Paul Jackson Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 51 +++++++++++++++++++++++++++++++-------------------- mm/shmem.c | 24 ++++++++++++++++-------- 2 files changed, 47 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1311dc4a3888..1f6ff9c1bbc3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -187,12 +187,13 @@ static int is_valid_nodemask(nodemask_t *nodemask) } /* Create a new policy */ -static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes) +static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, + nodemask_t *nodes) { struct mempolicy *policy; - pr_debug("setting mode %d nodes[0] %lx\n", - mode, nodes ? nodes_addr(*nodes)[0] : -1); + pr_debug("setting mode %d flags %d nodes[0] %lx\n", + mode, flags, nodes ? 
nodes_addr(*nodes)[0] : -1); if (mode == MPOL_DEFAULT) return NULL; @@ -224,6 +225,7 @@ static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes) BUG(); } policy->policy = mode; + policy->flags = flags; policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } @@ -466,13 +468,14 @@ static void mpol_set_task_struct_flag(void) } /* Set the process memory policy */ -static long do_set_mempolicy(unsigned short mode, nodemask_t *nodes) +static long do_set_mempolicy(unsigned short mode, unsigned short flags, + nodemask_t *nodes) { struct mempolicy *new; if (mpol_check_policy(mode, nodes)) return -EINVAL; - new = mpol_new(mode, nodes); + new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); @@ -573,7 +576,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, goto out; } } else - *policy = pol->policy; + *policy = pol->policy | pol->flags; if (vma) { up_read(¤t->mm->mmap_sem); @@ -763,8 +766,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * #endif static long do_mbind(unsigned long start, unsigned long len, - unsigned short mode, nodemask_t *nmask, - unsigned long flags) + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; @@ -796,7 +799,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (mpol_check_policy(mode, nmask)) return -EINVAL; - new = mpol_new(mode, nmask); + new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); @@ -807,8 +810,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n", start, start + len, - mode, nmask ? nodes_addr(*nmask)[0] : -1); + pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", + start, start + len, mode, mode_flags, + nmask ? 
nodes_addr(*nmask)[0] : -1); down_write(&mm->mmap_sem); vma = check_range(mm, start, end, nmask, @@ -907,13 +911,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, { nodemask_t nodes; int err; + unsigned short mode_flags; + mode_flags = mode & MPOL_MODE_FLAGS; + mode &= ~MPOL_MODE_FLAGS; if (mode >= MPOL_MAX) return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_mbind(start, len, mode, &nodes, flags); + return do_mbind(start, len, mode, mode_flags, &nodes, flags); } /* Set the process memory policy */ @@ -922,13 +929,16 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, { int err; nodemask_t nodes; + unsigned short flags; - if (mode < 0 || mode >= MPOL_MAX) + flags = mode & MPOL_MODE_FLAGS; + mode &= ~MPOL_MODE_FLAGS; + if ((unsigned int)mode >= MPOL_MAX) return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_set_mempolicy(mode, &nodes); + return do_set_mempolicy(mode, flags, &nodes); } asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, @@ -1641,7 +1651,7 @@ restart: } void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy, - nodemask_t *policy_nodes) + unsigned short flags, nodemask_t *policy_nodes) { info->root = RB_ROOT; spin_lock_init(&info->lock); @@ -1650,7 +1660,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy, struct mempolicy *newpol; /* Falls back to MPOL_DEFAULT on any error */ - newpol = mpol_new(policy, policy_nodes); + newpol = mpol_new(policy, flags, policy_nodes); if (!IS_ERR(newpol)) { /* Create pseudo-vma that contains just the policy */ struct vm_area_struct pvma; @@ -1671,9 +1681,10 @@ int mpol_set_shared_policy(struct shared_policy *info, struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - pr_debug("set_shared_policy %lx sz %lu %d %lx\n", + pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", vma->vm_pgoff, - sz, npol? npol->policy : -1, + sz, npol ? npol->policy : -1, + npol ? npol->flags : -1, npol ? 
nodes_addr(npol->v.nodes)[0] : -1); if (npol) { @@ -1746,14 +1757,14 @@ void __init numa_policy_init(void) if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); - if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) + if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); } /* Reset policy of current process to default */ void numa_default_policy(void) { - do_set_mempolicy(MPOL_DEFAULT, NULL); + do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* Migrate a policy to a different set of nodes */ diff --git a/mm/shmem.c b/mm/shmem.c index d8ef7ba831a5..1ccf794fbe61 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1080,9 +1080,10 @@ redirty: #ifdef CONFIG_NUMA #ifdef CONFIG_TMPFS static int shmem_parse_mpol(char *value, unsigned short *policy, - nodemask_t *policy_nodes) + unsigned short *mode_flags, nodemask_t *policy_nodes) { char *nodelist = strchr(value, ':'); + char *flags = strchr(value, '='); int err = 1; if (nodelist) { @@ -1093,6 +1094,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy, if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) goto out; } + if (flags) + *flags++ = '\0'; if (!strcmp(value, "default")) { *policy = MPOL_DEFAULT; /* Don't allow a nodelist */ @@ -1122,6 +1125,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy, *policy_nodes = node_states[N_HIGH_MEMORY]; err = 0; } + if (flags) { + } out: /* Restore string for error message */ if (nodelist) @@ -1130,7 +1135,7 @@ out: } static void shmem_show_mpol(struct seq_file *seq, unsigned short policy, - const nodemask_t policy_nodes) + unsigned short flags, const nodemask_t policy_nodes) { char *policy_string; @@ -1199,13 +1204,13 @@ static struct page *shmem_alloc_page(gfp_t gfp, #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS static inline int shmem_parse_mpol(char *value, unsigned short *policy, - nodemask_t *policy_nodes) + unsigned short *mode_flags, nodemask_t *policy_nodes) { return 1; } static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy, - const nodemask_t policy_nodes) + unsigned short flags, const nodemask_t policy_nodes) { } #endif /* CONFIG_TMPFS */ @@ -1578,7 +1583,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, sbinfo->policy, - &sbinfo->policy_nodes); + sbinfo->flags, &sbinfo->policy_nodes); break; case S_IFDIR: inc_nlink(inode); @@ -1592,7 +1597,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. 
*/ - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL); break; } @@ -2209,7 +2214,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, goto bad_val; } else if (!strcmp(this_char,"mpol")) { if (shmem_parse_mpol(value, &sbinfo->policy, - &sbinfo->policy_nodes)) + &sbinfo->flags, &sbinfo->policy_nodes)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", @@ -2261,6 +2266,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; sbinfo->policy = config.policy; + sbinfo->flags = config.flags; sbinfo->policy_nodes = config.policy_nodes; out: spin_unlock(&sbinfo->stat_lock); @@ -2282,7 +2288,8 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",uid=%u", sbinfo->uid); if (sbinfo->gid != 0) seq_printf(seq, ",gid=%u", sbinfo->gid); - shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes); + shmem_show_mpol(seq, sbinfo->policy, sbinfo->flags, + sbinfo->policy_nodes); return 0; } #endif /* CONFIG_TMPFS */ @@ -2313,6 +2320,7 @@ static int shmem_fill_super(struct super_block *sb, sbinfo->uid = current->fsuid; sbinfo->gid = current->fsgid; sbinfo->policy = MPOL_DEFAULT; + sbinfo->flags = 0; sbinfo->policy_nodes = node_states[N_HIGH_MEMORY]; sb->s_fs_info = sbinfo; -- cgit v1.2.3 From f5b087b52f1710eb0bf15a2d2b030c51a6a1ca9e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 28 Apr 2008 02:12:27 -0700 Subject: mempolicy: add MPOL_F_STATIC_NODES flag Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses the node remap when the policy is rebound. Adds another member to struct mempolicy, nodemask_t user_nodemask, as part of a union with cpuset_mems_allowed: struct mempolicy { ... union { nodemask_t cpuset_mems_allowed; nodemask_t user_nodemask; } w; } that stores the the nodemask that the user passed when he or she created the mempolicy via set_mempolicy() or mbind(). When using MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's passed nodemask intersected with the VMA or task's allowed nodes is always used when determining the preferred node, setting the MPOL_BIND zonelist, or creating the interleave nodemask. This happens whenever the policy is rebound, including when a task's cpuset assignment changes or the cpuset's mems are changed. This creates an interesting side-effect in that it allows the mempolicy "intent" to lie dormant and uneffected until it has access to the node(s) that it desires. For example, if you currently ask for an interleaved policy over a set of nodes that you do not have access to, the mempolicy is not created and the task continues to use the previous policy. With this change, however, it is possible to create the same mempolicy; it is only effected when access to nodes in the nodemask is acquired. It is also possible to mount tmpfs with the static nodemask behavior when specifying a node or nodemask. To do this, simply add "=static" immediately following the mempolicy mode at mount time: mount -o remount mpol=interleave=static:1-3 Also removes mpol_check_policy() and folds its logic into mpol_new() since it is now obsoleted. The unused vma_mpol_equal() is also removed. 
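A hedged usage sketch of the new flag from userspace (assumes headers that already define MPOL_F_STATIC_NODES, e.g. a current libnuma numaif.h; node numbers are arbitrary):

	#include <numaif.h>
	#include <stdio.h>

	int main(void)
	{
		/* interleave over nodes 1-3 and keep that nodemask fixed
		 * across cpuset rebinds */
		unsigned long nodemask = (1UL << 1) | (1UL << 2) | (1UL << 3);

		if (set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
				  &nodemask, 8 * sizeof(nodemask)))
			perror("set_mempolicy");
		return 0;
	}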
Cc: Paul Jackson Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 172 +++++++++++++++++++++++++++++---------------------------- mm/shmem.c | 2 + 2 files changed, 90 insertions(+), 84 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1f6ff9c1bbc3..d59b1e766aee 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -113,58 +113,6 @@ struct mempolicy default_policy = { static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask); -/* Do sanity checking on a policy */ -static int mpol_check_policy(unsigned short mode, nodemask_t *nodes) -{ - int was_empty, is_empty; - - if (!nodes) - return 0; - - /* - * "Contextualize" the in-coming nodemast for cpusets: - * Remember whether in-coming nodemask was empty, If not, - * restrict the nodes to the allowed nodes in the cpuset. - * This is guaranteed to be a subset of nodes with memory. - */ - cpuset_update_task_memory_state(); - is_empty = was_empty = nodes_empty(*nodes); - if (!was_empty) { - nodes_and(*nodes, *nodes, cpuset_current_mems_allowed); - is_empty = nodes_empty(*nodes); /* after "contextualization" */ - } - - switch (mode) { - case MPOL_DEFAULT: - /* - * require caller to specify an empty nodemask - * before "contextualization" - */ - if (!was_empty) - return -EINVAL; - break; - case MPOL_BIND: - case MPOL_INTERLEAVE: - /* - * require at least 1 valid node after "contextualization" - */ - if (is_empty) - return -EINVAL; - break; - case MPOL_PREFERRED: - /* - * Did caller specify invalid nodes? - * Don't silently accept this as "local allocation". - */ - if (!was_empty && is_empty) - return -EINVAL; - break; - default: - BUG(); - } - return 0; -} - /* Check that the nodemask contains at least one populated zone */ static int is_valid_nodemask(nodemask_t *nodemask) { @@ -186,48 +134,60 @@ static int is_valid_nodemask(nodemask_t *nodemask) return 0; } +static inline int mpol_store_user_nodemask(const struct mempolicy *pol) +{ + return pol->flags & MPOL_F_STATIC_NODES; +} + /* Create a new policy */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; + nodemask_t cpuset_context_nmask; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); if (mode == MPOL_DEFAULT) - return NULL; + return (nodes && nodes_weight(*nodes)) ? 
ERR_PTR(-EINVAL) : + NULL; policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); + cpuset_update_task_memory_state(); + nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed); switch (mode) { case MPOL_INTERLEAVE: - policy->v.nodes = *nodes; - if (nodes_weight(policy->v.nodes) == 0) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(-EINVAL); - } + if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask)) + goto free; + policy->v.nodes = cpuset_context_nmask; break; case MPOL_PREFERRED: - policy->v.preferred_node = first_node(*nodes); + policy->v.preferred_node = first_node(cpuset_context_nmask); if (policy->v.preferred_node >= MAX_NUMNODES) - policy->v.preferred_node = -1; + goto free; break; case MPOL_BIND: - if (!is_valid_nodemask(nodes)) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(-EINVAL); - } - policy->v.nodes = *nodes; + if (!is_valid_nodemask(&cpuset_context_nmask)) + goto free; + policy->v.nodes = cpuset_context_nmask; break; default: BUG(); } policy->policy = mode; policy->flags = flags; - policy->cpuset_mems_allowed = cpuset_mems_allowed(current); + if (mpol_store_user_nodemask(policy)) + policy->w.user_nodemask = *nodes; + else + policy->w.cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; + +free: + kmem_cache_free(policy_cache, policy); + return ERR_PTR(-EINVAL); } static void gather_stats(struct page *, void *, int pte_dirty); @@ -473,15 +433,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, { struct mempolicy *new; - if (mpol_check_policy(mode, nodes)) - return -EINVAL; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; mpol_set_task_struct_flag(); - if (new && new->policy == MPOL_INTERLEAVE) + if (new && new->policy == MPOL_INTERLEAVE && + nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); return 0; } @@ -796,9 +755,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (end == start) return 0; - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); @@ -1206,7 +1162,8 @@ static unsigned interleave_nodes(struct mempolicy *policy) next = next_node(nid, policy->v.nodes); if (next >= MAX_NUMNODES) next = first_node(policy->v.nodes); - me->il_next = next; + if (next < MAX_NUMNODES) + me->il_next = next; return nid; } @@ -1252,10 +1209,13 @@ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) { unsigned nnodes = nodes_weight(pol->v.nodes); - unsigned target = (unsigned)off % nnodes; + unsigned target; int c; int nid = -1; + if (!nnodes) + return numa_node_id(); + target = (unsigned int)off % nnodes; c = 0; do { nid = next_node(nid, pol->v.nodes); @@ -1465,6 +1425,16 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) return new; } +static int mpol_match_intent(const struct mempolicy *a, + const struct mempolicy *b) +{ + if (a->flags != b->flags) + return 0; + if (!mpol_store_user_nodemask(a)) + return 1; + return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); +} + /* Slow path of a mempolicy comparison */ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) { @@ -1472,6 +1442,8 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) return 0; if (a->policy != b->policy) return 0; + if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b)) + return 0; switch 
(a->policy) { case MPOL_DEFAULT: return 1; @@ -1771,13 +1743,14 @@ void numa_default_policy(void) static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { - nodemask_t *mpolmask; nodemask_t tmp; + int static_nodes; if (!pol) return; - mpolmask = &pol->cpuset_mems_allowed; - if (nodes_equal(*mpolmask, *newmask)) + static_nodes = pol->flags & MPOL_F_STATIC_NODES; + if (!mpol_store_user_nodemask(pol) && + nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; switch (pol->policy) { @@ -1786,16 +1759,35 @@ static void mpol_rebind_policy(struct mempolicy *pol, case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); + if (static_nodes) + nodes_and(tmp, pol->w.user_nodemask, *newmask); + else { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *newmask); + pol->w.cpuset_mems_allowed = *newmask; + } pol->v.nodes = tmp; - *mpolmask = *newmask; - current->il_next = node_remap(current->il_next, - *mpolmask, *newmask); + if (!node_isset(current->il_next, tmp)) { + current->il_next = next_node(current->il_next, tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = first_node(tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = numa_node_id(); + } break; case MPOL_PREFERRED: - pol->v.preferred_node = node_remap(pol->v.preferred_node, - *mpolmask, *newmask); - *mpolmask = *newmask; + if (static_nodes) { + int node = first_node(pol->w.user_nodemask); + + if (node_isset(node, *newmask)) + pol->v.preferred_node = node; + else + pol->v.preferred_node = -1; + } else { + pol->v.preferred_node = node_remap(pol->v.preferred_node, + pol->w.cpuset_mems_allowed, *newmask); + pol->w.cpuset_mems_allowed = *newmask; + } break; default: BUG(); @@ -1847,6 +1839,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) int l; nodemask_t nodes; unsigned short mode = pol ? pol->policy : MPOL_DEFAULT; + unsigned short flags = pol ? pol->flags : 0; switch (mode) { case MPOL_DEFAULT: @@ -1876,6 +1869,17 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) strcpy(p, policy_types[mode]); p += l; + if (flags) { + int need_bar = 0; + + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; + + if (flags & MPOL_F_STATIC_NODES) + p += sprintf(p, "%sstatic", need_bar++ ? "|" : ""); + } + if (!nodes_empty(nodes)) { if (buffer + maxlen < p + 2) return -ENOSPC; diff --git a/mm/shmem.c b/mm/shmem.c index 1ccf794fbe61..3e9fda0ca470 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1126,6 +1126,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy, err = 0; } if (flags) { + if (!strcmp(flags, "static")) + *mode_flags |= MPOL_F_STATIC_NODES; } out: /* Restore string for error message */ -- cgit v1.2.3 From 4c50bc0116cf3cc35e7152d6a8424b4db65f52d6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 28 Apr 2008 02:12:30 -0700 Subject: mempolicy: add MPOL_F_RELATIVE_NODES flag Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies nodemasks passed via set_mempolicy() or mbind() should be considered relative to the current task's mems_allowed. When the mempolicy is created, the passed nodemask is folded and mapped onto the current task's mems_allowed. For example, consider a task using set_mempolicy() to pass MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES with a nodemask of 1-3. If current's mems_allowed is 4-7, the effected nodemask is 5-7 (the second, third, and fourth node of mems_allowed). 
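A rough userspace illustration of that relative remap, with plain bit masks standing in for nodemask_t (the kernel's mpol_relative_nodemask() uses nodes_fold() and nodes_onto() for the same fold-and-map):

	#include <stdio.h>

	static unsigned long relative_map(unsigned long requested,
					  unsigned long allowed)
	{
		unsigned long result = 0;
		int nr_allowed = __builtin_popcountl(allowed);
		int bit, b, idx;

		if (!nr_allowed)
			return 0;
		for (bit = 0; bit < 8 * (int)sizeof(requested); bit++) {
			if (!(requested & (1UL << bit)))
				continue;
			idx = bit % nr_allowed;		/* fold onto the allowed set size */
			for (b = 0; b < 8 * (int)sizeof(allowed); b++) {
				if ((allowed & (1UL << b)) && idx-- == 0) {
					result |= 1UL << b;	/* take the idx-th set bit */
					break;
				}
			}
		}
		return result;
	}

	int main(void)
	{
		/* nodes 1-3 relative to mems_allowed 4-7 -> nodes 5-7, as above */
		printf("%#lx\n", relative_map(0x0eUL, 0xf0UL));
		return 0;
	}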
If the same task is attached to a cpuset, the mempolicy nodemask is rebound each time the mems are changed. Some possible rebinds and results are: mems result 1-3 1-3 1-7 2-4 1,5-6 1,5-6 1,5-7 5-7 Likewise, the zonelist built for MPOL_BIND acts on the set of zones assigned to the resultant nodemask from the relative remap. In the MPOL_PREFERRED case, the preferred node is remapped from the currently effected nodemask to the relative nodemask. This mempolicy mode flag was conceived of by Paul Jackson . Cc: Paul Jackson Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 33 +++++++++++++++++++++++++++++++-- mm/shmem.c | 6 ++++++ 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d59b1e766aee..ffd3be66b255 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -136,7 +136,15 @@ static int is_valid_nodemask(nodemask_t *nodemask) static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & MPOL_F_STATIC_NODES; + return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); +} + +static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, + const nodemask_t *rel) +{ + nodemask_t tmp; + nodes_fold(tmp, *orig, nodes_weight(*rel)); + nodes_onto(*ret, tmp, *rel); } /* Create a new policy */ @@ -157,7 +165,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); cpuset_update_task_memory_state(); - nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed); + if (flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); switch (mode) { case MPOL_INTERLEAVE: if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask)) @@ -873,6 +886,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, mode &= ~MPOL_MODE_FLAGS; if (mode >= MPOL_MAX) return -EINVAL; + if ((mode_flags & MPOL_F_STATIC_NODES) && + (mode_flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; @@ -891,6 +907,8 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)mode >= MPOL_MAX) return -EINVAL; + if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; @@ -1745,10 +1763,12 @@ static void mpol_rebind_policy(struct mempolicy *pol, { nodemask_t tmp; int static_nodes; + int relative_nodes; if (!pol) return; static_nodes = pol->flags & MPOL_F_STATIC_NODES; + relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -1761,6 +1781,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, case MPOL_INTERLEAVE: if (static_nodes) nodes_and(tmp, pol->w.user_nodemask, *newmask); + else if (relative_nodes) + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, + newmask); else { nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, *newmask); @@ -1783,6 +1806,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, pol->v.preferred_node = node; else pol->v.preferred_node = -1; + } else if (relative_nodes) { + mpol_relative_nodemask(&tmp, 
&pol->w.user_nodemask, + newmask); + pol->v.preferred_node = first_node(tmp); } else { pol->v.preferred_node = node_remap(pol->v.preferred_node, pol->w.cpuset_mems_allowed, *newmask); @@ -1878,6 +1905,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) if (flags & MPOL_F_STATIC_NODES) p += sprintf(p, "%sstatic", need_bar++ ? "|" : ""); + if (flags & MPOL_F_RELATIVE_NODES) + p += sprintf(p, "%srelative", need_bar++ ? "|" : ""); } if (!nodes_empty(nodes)) { diff --git a/mm/shmem.c b/mm/shmem.c index 3e9fda0ca470..9435f298dd75 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1128,6 +1128,12 @@ static int shmem_parse_mpol(char *value, unsigned short *policy, if (flags) { if (!strcmp(flags, "static")) *mode_flags |= MPOL_F_STATIC_NODES; + if (!strcmp(flags, "relative")) + *mode_flags |= MPOL_F_RELATIVE_NODES; + + if ((*mode_flags & MPOL_F_STATIC_NODES) && + (*mode_flags & MPOL_F_RELATIVE_NODES)) + err = 1; } out: /* Restore string for error message */ -- cgit v1.2.3 From 1d0d2680a01c4f9e292ec6d4714884da939053a1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 28 Apr 2008 02:12:32 -0700 Subject: mempolicy: move rebind functions Move the mpol_rebind_{policy,task,mm}() functions after mpol_new() to avoid having to declare function prototypes. Cc: Paul Jackson Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 185 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 91 insertions(+), 94 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ffd3be66b255..d44c524e5ae4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -110,9 +110,6 @@ struct mempolicy default_policy = { .policy = MPOL_DEFAULT, }; -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask); - /* Check that the nodemask contains at least one populated zone */ static int is_valid_nodemask(nodemask_t *nodemask) { @@ -203,6 +200,97 @@ free: return ERR_PTR(-EINVAL); } +/* Migrate a policy to a different set of nodes */ +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask) +{ + nodemask_t tmp; + int static_nodes; + int relative_nodes; + + if (!pol) + return; + static_nodes = pol->flags & MPOL_F_STATIC_NODES; + relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES; + if (!mpol_store_user_nodemask(pol) && + nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) + return; + + switch (pol->policy) { + case MPOL_DEFAULT: + break; + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + if (static_nodes) + nodes_and(tmp, pol->w.user_nodemask, *newmask); + else if (relative_nodes) + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, + newmask); + else { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *newmask); + pol->w.cpuset_mems_allowed = *newmask; + } + pol->v.nodes = tmp; + if (!node_isset(current->il_next, tmp)) { + current->il_next = next_node(current->il_next, tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = first_node(tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = numa_node_id(); + } + break; + case MPOL_PREFERRED: + if (static_nodes) { + int node = first_node(pol->w.user_nodemask); + + if (node_isset(node, *newmask)) + pol->v.preferred_node = node; + else + pol->v.preferred_node = -1; + } else if (relative_nodes) { + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, + newmask); + pol->v.preferred_node = first_node(tmp); + } else 
{ + pol->v.preferred_node = node_remap(pol->v.preferred_node, + pol->w.cpuset_mems_allowed, *newmask); + pol->w.cpuset_mems_allowed = *newmask; + } + break; + default: + BUG(); + break; + } +} + +/* + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. + */ + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +{ + mpol_rebind_policy(tsk->mempolicy, new); +} + +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. + */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); +} + static void gather_stats(struct page *, void *, int pte_dirty); static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); @@ -1757,97 +1845,6 @@ void numa_default_policy(void) do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } -/* Migrate a policy to a different set of nodes */ -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask) -{ - nodemask_t tmp; - int static_nodes; - int relative_nodes; - - if (!pol) - return; - static_nodes = pol->flags & MPOL_F_STATIC_NODES; - relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES; - if (!mpol_store_user_nodemask(pol) && - nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) - return; - - switch (pol->policy) { - case MPOL_DEFAULT: - break; - case MPOL_BIND: - /* Fall through */ - case MPOL_INTERLEAVE: - if (static_nodes) - nodes_and(tmp, pol->w.user_nodemask, *newmask); - else if (relative_nodes) - mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, - newmask); - else { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *newmask); - pol->w.cpuset_mems_allowed = *newmask; - } - pol->v.nodes = tmp; - if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = numa_node_id(); - } - break; - case MPOL_PREFERRED: - if (static_nodes) { - int node = first_node(pol->w.user_nodemask); - - if (node_isset(node, *newmask)) - pol->v.preferred_node = node; - else - pol->v.preferred_node = -1; - } else if (relative_nodes) { - mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, - newmask); - pol->v.preferred_node = first_node(tmp); - } else { - pol->v.preferred_node = node_remap(pol->v.preferred_node, - pol->w.cpuset_mems_allowed, *newmask); - pol->w.cpuset_mems_allowed = *newmask; - } - break; - default: - BUG(); - break; - } -} - -/* - * Wrapper for mpol_rebind_policy() that just requires task - * pointer, and updates task mempolicy. - */ - -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) -{ - mpol_rebind_policy(tsk->mempolicy, new); -} - -/* - * Rebind each vma in mm to new nodemask. - * - * Call holding a reference to mm. Takes mm->mmap_sem during call. - */ - -void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) -{ - struct vm_area_struct *vma; - - down_write(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new); - up_write(&mm->mmap_sem); -} - /* * Display pages allocated per node and memory policy via /proc. 
*/ -- cgit v1.2.3 From 37012946da940521fb997a758a219d2f1ab56e51 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 28 Apr 2008 02:12:33 -0700 Subject: mempolicy: create mempolicy_operations structure Create a mempolicy_operations structure that currently points to two functions[*] for the various modes: int (*create)(struct mempolicy *, const nodemask_t *); void (*rebind)(struct mempolicy *, const nodemask_t *); This splits the implementation for the various modes out of two large functions, mpol_new() and mpol_rebind_policy(). Eventually it may be beneficial to add additional functions to accomodate the existing switch() statements in mm/mempolicy.c. [*] The ->create() function for MPOL_DEFAULT is currently NULL since no struct mempolicy is dynamically allocated. [Lee.Schermerhorn@hp.com: fix regression in the package mempolicy regression tests] Signed-off-by: David Rientjes Cc: Paul Jackson Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Lee Schermerhorn Cc: Eric Whitney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 233 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 140 insertions(+), 93 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d44c524e5ae4..a94d994eaaa8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -63,7 +63,6 @@ grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. - could replace all the switch()es with a mempolicy_ops structure. */ #include @@ -110,8 +109,13 @@ struct mempolicy default_policy = { .policy = MPOL_DEFAULT, }; +static const struct mempolicy_operations { + int (*create)(struct mempolicy *pol, const nodemask_t *nodes); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); +} mpol_ops[MPOL_MAX]; + /* Check that the nodemask contains at least one populated zone */ -static int is_valid_nodemask(nodemask_t *nodemask) +static int is_valid_nodemask(const nodemask_t *nodemask) { int nd, k; @@ -144,125 +148,151 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, nodes_onto(*ret, tmp, *rel); } +static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; +} + +static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (!nodes) + pol->v.preferred_node = -1; /* local allocation */ + else if (nodes_empty(*nodes)) + return -EINVAL; /* no allowed nodes */ + else + pol->v.preferred_node = first_node(*nodes); + return 0; +} + +static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (!is_valid_nodemask(nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; +} + /* Create a new policy */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; nodemask_t cpuset_context_nmask; + int localalloc = 0; + int ret; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); if (mode == MPOL_DEFAULT) - return (nodes && nodes_weight(*nodes)) ? 
ERR_PTR(-EINVAL) : - NULL; + return NULL; + if (!nodes || nodes_empty(*nodes)) { + if (mode != MPOL_PREFERRED) + return ERR_PTR(-EINVAL); + localalloc = 1; /* special case: no mode flags */ + } policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); - cpuset_update_task_memory_state(); - if (flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); - else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); - switch (mode) { - case MPOL_INTERLEAVE: - if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask)) - goto free; - policy->v.nodes = cpuset_context_nmask; - break; - case MPOL_PREFERRED: - policy->v.preferred_node = first_node(cpuset_context_nmask); - if (policy->v.preferred_node >= MAX_NUMNODES) - goto free; - break; - case MPOL_BIND: - if (!is_valid_nodemask(&cpuset_context_nmask)) - goto free; - policy->v.nodes = cpuset_context_nmask; - break; - default: - BUG(); - } policy->policy = mode; - policy->flags = flags; - if (mpol_store_user_nodemask(policy)) - policy->w.user_nodemask = *nodes; - else - policy->w.cpuset_mems_allowed = cpuset_mems_allowed(current); + + if (!localalloc) { + policy->flags = flags; + cpuset_update_task_memory_state(); + if (flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); + if (mpol_store_user_nodemask(policy)) + policy->w.user_nodemask = *nodes; + else + policy->w.cpuset_mems_allowed = + cpuset_mems_allowed(current); + } + + ret = mpol_ops[mode].create(policy, + localalloc ? NULL : &cpuset_context_nmask); + if (ret < 0) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(ret); + } return policy; +} + +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +{ +} + +static void mpol_rebind_nodemask(struct mempolicy *pol, + const nodemask_t *nodes) +{ + nodemask_t tmp; + + if (pol->flags & MPOL_F_STATIC_NODES) + nodes_and(tmp, pol->w.user_nodemask, *nodes); + else if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + else { + nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = *nodes; + } -free: - kmem_cache_free(policy_cache, policy); - return ERR_PTR(-EINVAL); + pol->v.nodes = tmp; + if (!node_isset(current->il_next, tmp)) { + current->il_next = next_node(current->il_next, tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = first_node(tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = numa_node_id(); + } +} + +static void mpol_rebind_preferred(struct mempolicy *pol, + const nodemask_t *nodes) +{ + nodemask_t tmp; + + /* + * check 'STATIC_NODES first, as preferred_node == -1 may be + * a temporary, "fallback" state for this policy. 
+ */ + if (pol->flags & MPOL_F_STATIC_NODES) { + int node = first_node(pol->w.user_nodemask); + + if (node_isset(node, *nodes)) + pol->v.preferred_node = node; + else + pol->v.preferred_node = -1; + } else if (pol->v.preferred_node == -1) { + return; /* no remap required for explicit local alloc */ + } else if (pol->flags & MPOL_F_RELATIVE_NODES) { + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + pol->v.preferred_node = first_node(tmp); + } else { + pol->v.preferred_node = node_remap(pol->v.preferred_node, + pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = *nodes; + } } /* Migrate a policy to a different set of nodes */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { - nodemask_t tmp; - int static_nodes; - int relative_nodes; - if (!pol) return; - static_nodes = pol->flags & MPOL_F_STATIC_NODES; - relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - - switch (pol->policy) { - case MPOL_DEFAULT: - break; - case MPOL_BIND: - /* Fall through */ - case MPOL_INTERLEAVE: - if (static_nodes) - nodes_and(tmp, pol->w.user_nodemask, *newmask); - else if (relative_nodes) - mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, - newmask); - else { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *newmask); - pol->w.cpuset_mems_allowed = *newmask; - } - pol->v.nodes = tmp; - if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = numa_node_id(); - } - break; - case MPOL_PREFERRED: - if (static_nodes) { - int node = first_node(pol->w.user_nodemask); - - if (node_isset(node, *newmask)) - pol->v.preferred_node = node; - else - pol->v.preferred_node = -1; - } else if (relative_nodes) { - mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, - newmask); - pol->v.preferred_node = first_node(tmp); - } else { - pol->v.preferred_node = node_remap(pol->v.preferred_node, - pol->w.cpuset_mems_allowed, *newmask); - pol->w.cpuset_mems_allowed = *newmask; - } - break; - default: - BUG(); - break; - } + mpol_ops[pol->policy].rebind(pol, newmask); } /* @@ -291,6 +321,24 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) up_write(&mm->mmap_sem); } +static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { + [MPOL_DEFAULT] = { + .rebind = mpol_rebind_default, + }, + [MPOL_INTERLEAVE] = { + .create = mpol_new_interleave, + .rebind = mpol_rebind_nodemask, + }, + [MPOL_PREFERRED] = { + .create = mpol_new_preferred, + .rebind = mpol_rebind_preferred, + }, + [MPOL_BIND] = { + .create = mpol_new_bind, + .rebind = mpol_rebind_nodemask, + }, +}; + static void gather_stats(struct page *, void *, int pte_dirty); static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); @@ -1848,7 +1896,6 @@ void numa_default_policy(void) /* * Display pages allocated per node and memory policy via /proc. 
*/ - static const char * const policy_types[] = { "default", "prefer", "bind", "interleave" }; -- cgit v1.2.3 From 3e1f064562fcff7bf3856bc1d00dfa84d4f121cc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 28 Apr 2008 02:12:34 -0700 Subject: mempolicy: disallow static or relative flags for local preferred mode MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES don't mean anything for MPOL_PREFERRED policies that were created with an empty nodemask (for purely local allocations). They'll never be invalidated because the allowed mems of a task changes or need to be rebound relative to a cpuset's placement. Also fixes a bug identified by Lee Schermerhorn that disallowed empty nodemasks to be passed to MPOL_PREFERRED to specify local allocations. [A different, somewhat incomplete, patch already existed in 25-rc5-mm1.] Cc: Paul Jackson Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: Andi Kleen Cc: Randy Dunlap Signed-off-by: Lee Schermerhorn Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a94d994eaaa8..c1b907789d84 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -181,27 +181,43 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, { struct mempolicy *policy; nodemask_t cpuset_context_nmask; - int localalloc = 0; int ret; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); - if (mode == MPOL_DEFAULT) - return NULL; - if (!nodes || nodes_empty(*nodes)) { - if (mode != MPOL_PREFERRED) + if (mode == MPOL_DEFAULT) { + if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - localalloc = 1; /* special case: no mode flags */ + return NULL; } + VM_BUG_ON(!nodes); + + /* + * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or + * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). + * All other modes require a valid pointer to a non-empty nodemask. + */ + if (mode == MPOL_PREFERRED) { + if (nodes_empty(*nodes)) { + if (((flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES))) + return ERR_PTR(-EINVAL); + nodes = NULL; /* flag local alloc */ + } + } else if (nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->policy = mode; + policy->flags = flags; - if (!localalloc) { - policy->flags = flags; + if (nodes) { + /* + * cpuset related setup doesn't apply to local allocation + */ cpuset_update_task_memory_state(); if (flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&cpuset_context_nmask, nodes, @@ -217,7 +233,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, } ret = mpol_ops[mode].create(policy, - localalloc ? NULL : &cpuset_context_nmask); + nodes ? &cpuset_context_nmask : NULL); if (ret < 0) { kmem_cache_free(policy_cache, policy); return ERR_PTR(ret); @@ -259,10 +275,6 @@ static void mpol_rebind_preferred(struct mempolicy *pol, { nodemask_t tmp; - /* - * check 'STATIC_NODES first, as preferred_node == -1 may be - * a temporary, "fallback" state for this policy. 
- */ if (pol->flags & MPOL_F_STATIC_NODES) { int node = first_node(pol->w.user_nodemask); @@ -270,12 +282,10 @@ static void mpol_rebind_preferred(struct mempolicy *pol, pol->v.preferred_node = node; else pol->v.preferred_node = -1; - } else if (pol->v.preferred_node == -1) { - return; /* no remap required for explicit local alloc */ } else if (pol->flags & MPOL_F_RELATIVE_NODES) { mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); pol->v.preferred_node = first_node(tmp); - } else { + } else if (pol->v.preferred_node != -1) { pol->v.preferred_node = node_remap(pol->v.preferred_node, pol->w.cpuset_mems_allowed, *nodes); -- cgit v1.2.3 From a43361cf3cb6fb6431fdbfb0f3ef26a334826160 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:12:36 -0700 Subject: mempolicy: fix parsing of tmpfs mpol mount option Parsing of new mode flags in the tmpfs mpol mount option is slightly broken: Setting a valid flag works OK: #mount -o remount,mpol=bind=static:1-2 /dev/shm #mount ... tmpfs on /dev/shm type tmpfs (rw,mpol=bind=static:1-2) ... However, we can't remove them or change them, once we've set a valid flag: #mount -o remount,mpol=bind:1-2 /dev/shm #mount ... tmpfs on /dev/shm type tmpfs (rw,mpol=bind:1-2) ... It SAYS it removed it, but that's just a copy of the input string. If we now try to set it to a different flag, we get: #mount -o remount,mpol=bind=relative:1-2 /dev/shm mount: /dev/shm not mounted already, or bad option And on the console, we see: tmpfs: Bad value 'bind' for mount option 'mpol' ^ lost remainder of string Furthermore, bogus flags are accepted with out error. Granted, they are a no-op: #mount -o remount,mpol=interleave=foo:0-3 /dev/shm #mount ... tmpfs on /dev/shm type tmpfs (rw,mpol=interleave=foo:0-3) Again, that's just a copy of the input string shown by the mount command. This patch fixes the behavior by pre-zeroing the flags so that only one of the mutually exclusive flags can be set at one time. It also reports an error when an unrecognized flag is specified. The check for both flags being set is removed because it can't happen with this implementation. If we ever want to support multiple non-exclusive flags, this area will need rework and we will need to check that any mutually exclusive flags aren't specified. Signed-off-by: Lee Schermerhorn Cc: David Rientjes Cc: Paul Jackson Cc: Christoph Lameter Cc: Andi Kleen Cc: Eric Whitney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 9435f298dd75..177c7a7d2bb3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1125,20 +1125,26 @@ static int shmem_parse_mpol(char *value, unsigned short *policy, *policy_nodes = node_states[N_HIGH_MEMORY]; err = 0; } + + *mode_flags = 0; if (flags) { + /* + * Currently, we only support two mutually exclusive + * mode flags. 
+ */ if (!strcmp(flags, "static")) *mode_flags |= MPOL_F_STATIC_NODES; - if (!strcmp(flags, "relative")) + else if (!strcmp(flags, "relative")) *mode_flags |= MPOL_F_RELATIVE_NODES; - - if ((*mode_flags & MPOL_F_STATIC_NODES) && - (*mode_flags & MPOL_F_RELATIVE_NODES)) - err = 1; + else + err = 1; /* unrecognized flag */ } out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; + if (flags) + *--flags = '='; return err; } -- cgit v1.2.3 From b5ee5befa75e33e55d34584ad10286c5005cb1de Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 28 Apr 2008 02:12:37 -0700 Subject: dmapool: enable debugging for CONFIG_SLUB_DEBUG_ON too Previously it was only enabled for CONFIG_DEBUG_SLAB. Not hooked into the slub runtime debug configuration, so you currently only get it with CONFIG_SLUB_DEBUG_ON, not plain CONFIG_SLUB_DEBUG Acked-by: Matthew Wilcox Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index 34aaac451a96..b1f0885dda22 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -37,6 +37,10 @@ #include #include +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#define DMAPOOL_DEBUG 1 +#endif + struct dma_pool { /* the pool */ struct list_head page_list; spinlock_t lock; @@ -216,7 +220,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, &page->dma, mem_flags); if (page->vaddr) { -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif pool_initialise_page(pool, page); @@ -239,7 +243,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) { dma_addr_t dma = page->dma; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); @@ -336,7 +340,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, page->offset = *(int *)(page->vaddr + offset); retval = offset + page->vaddr; *handle = offset + page->dma; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif done: @@ -391,7 +395,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } offset = vaddr - page->vaddr; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { if (pool->dev) dev_err(pool->dev, -- cgit v1.2.3 From f05111f50105ac479a008cf85749cf9c956453ea Mon Sep 17 00:00:00 2001 From: "S.Caglar Onur" Date: Mon, 28 Apr 2008 02:12:38 -0700 Subject: mm/page_alloc.c: fix indentation zlc_setup(): handle jiffies wraparound (10ed273f5016c582413dfbc468dd084957d847e1) changes tab with spaces Signed-off-by: S.Caglar Onur Cc: Lee Schermerhorn Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4beb3eea8b7..af28e2cec8b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1284,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) if (!zlc) return NULL; - if (time_after(jiffies, zlc->last_full_zap + HZ)) { + if (time_after(jiffies, zlc->last_full_zap + HZ)) { bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); zlc->last_full_zap = jiffies; } -- cgit v1.2.3 
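The re-indented line above is the wraparound-safe expiry test added by the commit cited in that changelog; as a reminder, the idiom behind time_after() boils down to a signed comparison (simplified sketch, not the kernel macro itself):

	/* true if counter a is after b, even across unsigned wraparound */
	static inline int counter_after(unsigned long a, unsigned long b)
	{
		return (long)(b - a) < 0;
	}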
From ac6aadb24b7d4f0e54246732e221c102073412bf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2008 02:12:38 -0700 Subject: mm: rotate_reclaimable_page() cleanup Clean up messy conditional calling of test_clear_page_writeback() from both rotate_reclaimable_page() and end_page_writeback(). The only user of rotate_reclaimable_page() is end_page_writeback() so this is OK. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 ++++++---- mm/swap.c | 37 ++++++++++++------------------------- 2 files changed, 18 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 07e9d9258b48..239d36163bbe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -576,10 +576,12 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - if (!test_clear_page_writeback(page)) - BUG(); - } + if (TestClearPageReclaim(page)) + rotate_reclaimable_page(page); + + if (!test_clear_page_writeback(page)) + BUG(); + smp_mb__after_clear_bit(); wake_up_page(page, PG_writeback); } diff --git a/mm/swap.c b/mm/swap.c index aa1139ccf3a7..91e194445a5e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -132,34 +132,21 @@ static void pagevec_move_tail(struct pagevec *pvec) * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. - * - * Returns zero if it cleared PG_writeback. */ -int rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { - struct pagevec *pvec; - unsigned long flags; - - if (PageLocked(page)) - return 1; - if (PageDirty(page)) - return 1; - if (PageActive(page)) - return 1; - if (!PageLRU(page)) - return 1; - - page_cache_get(page); - local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); - if (!pagevec_add(pvec, page)) - pagevec_move_tail(pvec); - local_irq_restore(flags); - - if (!test_clear_page_writeback(page)) - BUG(); + if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && + PageLRU(page)) { + struct pagevec *pvec; + unsigned long flags; - return 0; + page_cache_get(page); + local_irq_save(flags); + pvec = &__get_cpu_var(lru_rotate_pvecs); + if (!pagevec_add(pvec, page)) + pagevec_move_tail(pvec); + local_irq_restore(flags); + } } /* -- cgit v1.2.3 From a10aa579878fc6f9cd17455067380bbdf1d53c91 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Apr 2008 02:12:40 -0700 Subject: vmalloc: show vmalloced areas via /proc/vmallocinfo Implement a new proc file that allows the display of the currently allocated vmalloc memory. It allows to see the users of vmalloc. That is important if vmalloc space is scarce (i386 for example). And it's going to be important for the compound page fallback to vmalloc. Many of the current users can be switched to use compound pages with fallback. This means that the number of users of vmalloc is reduced and page tables no longer necessary to access the memory. /proc/vmallocinfo allows to review how that reduction occurs. If memory becomes fragmented and larger order allocations are no longer possible then /proc/vmallocinfo allows to see which compound page allocations fell back to virtual compound pages. That is important for new users of virtual compound pages. Such as order 1 stack allocation etc that may fallback to virtual compound pages in the future. 
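To give a feel for how the new file is consumed, here is a small hypothetical user-space reader, not part of the patch, that simply dumps whatever records the kernel exposes (it assumes the caller has permission to open the file):

#include <stdio.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/vmallocinfo", "r");

        if (!f) {
                perror("/proc/vmallocinfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);            /* one "start-end size [flags]" record per line */
        fclose(f);
        return 0;
}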
/proc/vmallocinfo permissions are made readable-only-by-root to avoid possible information leakage. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: CONFIG_MMU=n build fix] Signed-off-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: Nick Piggin Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ecf91f8034bf..afa550f66537 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,7 +14,7 @@ #include #include #include - +#include #include #include @@ -873,3 +873,77 @@ void free_vm_area(struct vm_struct *area) kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); + + +#ifdef CONFIG_PROC_FS +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct vm_struct *v; + + read_lock(&vmlist_lock); + v = vmlist; + while (n > 0 && v) { + n--; + v = v->next; + } + if (!n) + return v; + + return NULL; + +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct vm_struct *v = p; + + ++*pos; + return v->next; +} + +static void s_stop(struct seq_file *m, void *p) +{ + read_unlock(&vmlist_lock); +} + +static int s_show(struct seq_file *m, void *p) +{ + struct vm_struct *v = p; + + seq_printf(m, "0x%p-0x%p %7ld", + v->addr, v->addr + v->size, v->size); + + if (v->nr_pages) + seq_printf(m, " pages=%d", v->nr_pages); + + if (v->phys_addr) + seq_printf(m, " phys=%lx", v->phys_addr); + + if (v->flags & VM_IOREMAP) + seq_printf(m, " ioremap"); + + if (v->flags & VM_ALLOC) + seq_printf(m, " vmalloc"); + + if (v->flags & VM_MAP) + seq_printf(m, " vmap"); + + if (v->flags & VM_USERMAP) + seq_printf(m, " user"); + + if (v->flags & VM_VPAGES) + seq_printf(m, " vpages"); + + seq_putc(m, '\n'); + return 0; +} + +const struct seq_operations vmalloc_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; +#endif + -- cgit v1.2.3 From 2301696932b55e2ea2085cefc84f7b94fa2dd54b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Apr 2008 02:12:42 -0700 Subject: vmallocinfo: add caller information Add caller information so that /proc/vmallocinfo shows where the allocation request for a slice of vmalloc memory originated. 
Results in output like this: 0xffffc20000000000-0xffffc20000801000 8392704 alloc_large_system_hash+0x127/0x246 pages=2048 vmalloc vpages 0xffffc20000801000-0xffffc20000806000 20480 alloc_large_system_hash+0x127/0x246 pages=4 vmalloc 0xffffc20000806000-0xffffc20000c07000 4198400 alloc_large_system_hash+0x127/0x246 pages=1024 vmalloc vpages 0xffffc20000c07000-0xffffc20000c0a000 12288 alloc_large_system_hash+0x127/0x246 pages=2 vmalloc 0xffffc20000c0a000-0xffffc20000c0c000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c0c000-0xffffc20000c0f000 12288 acpi_os_map_memory+0x13/0x1c phys=cff64000 ioremap 0xffffc20000c10000-0xffffc20000c15000 20480 acpi_os_map_memory+0x13/0x1c phys=cff65000 ioremap 0xffffc20000c16000-0xffffc20000c18000 8192 acpi_os_map_memory+0x13/0x1c phys=cff69000 ioremap 0xffffc20000c18000-0xffffc20000c1a000 8192 acpi_os_map_memory+0x13/0x1c phys=fed1f000 ioremap 0xffffc20000c1a000-0xffffc20000c1c000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c1c000-0xffffc20000c1e000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c1e000-0xffffc20000c20000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c20000-0xffffc20000c22000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c22000-0xffffc20000c24000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c24000-0xffffc20000c26000 8192 acpi_os_map_memory+0x13/0x1c phys=e0081000 ioremap 0xffffc20000c26000-0xffffc20000c28000 8192 acpi_os_map_memory+0x13/0x1c phys=e0080000 ioremap 0xffffc20000c28000-0xffffc20000c2d000 20480 alloc_large_system_hash+0x127/0x246 pages=4 vmalloc 0xffffc20000c2d000-0xffffc20000c31000 16384 tcp_init+0xd5/0x31c pages=3 vmalloc 0xffffc20000c31000-0xffffc20000c34000 12288 alloc_large_system_hash+0x127/0x246 pages=2 vmalloc 0xffffc20000c34000-0xffffc20000c36000 8192 init_vdso_vars+0xde/0x1f1 0xffffc20000c36000-0xffffc20000c38000 8192 pci_iomap+0x8a/0xb4 phys=d8e00000 ioremap 0xffffc20000c38000-0xffffc20000c3a000 8192 usb_hcd_pci_probe+0x139/0x295 [usbcore] phys=d8e00000 ioremap 0xffffc20000c3a000-0xffffc20000c3e000 16384 sys_swapon+0x509/0xa15 pages=3 vmalloc 0xffffc20000c40000-0xffffc20000c61000 135168 e1000_probe+0x1c4/0xa32 phys=d8a20000 ioremap 0xffffc20000c61000-0xffffc20000c6a000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20000c6a000-0xffffc20000c73000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20000c73000-0xffffc20000c7c000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20000c7c000-0xffffc20000c7f000 12288 e1000e_setup_tx_resources+0x29/0xbe pages=2 vmalloc 0xffffc20000c80000-0xffffc20001481000 8392704 pci_mmcfg_arch_init+0x90/0x118 phys=e0000000 ioremap 0xffffc20001481000-0xffffc20001682000 2101248 alloc_large_system_hash+0x127/0x246 pages=512 vmalloc 0xffffc20001682000-0xffffc20001e83000 8392704 alloc_large_system_hash+0x127/0x246 pages=2048 vmalloc vpages 0xffffc20001e83000-0xffffc20002204000 3674112 alloc_large_system_hash+0x127/0x246 pages=896 vmalloc vpages 0xffffc20002204000-0xffffc2000220d000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc2000220d000-0xffffc20002216000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20002216000-0xffffc2000221f000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc2000221f000-0xffffc20002228000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20002228000-0xffffc20002231000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20002231000-0xffffc20002234000 12288 e1000e_setup_rx_resources+0x35/0x122 pages=2 vmalloc 0xffffc20002240000-0xffffc20002261000 135168 
e1000_probe+0x1c4/0xa32 phys=d8a60000 ioremap 0xffffc20002261000-0xffffc2000270c000 4894720 sys_swapon+0x509/0xa15 pages=1194 vmalloc vpages 0xffffffffa0000000-0xffffffffa0022000 139264 module_alloc+0x4f/0x55 pages=33 vmalloc 0xffffffffa0022000-0xffffffffa0029000 28672 module_alloc+0x4f/0x55 pages=6 vmalloc 0xffffffffa002b000-0xffffffffa0034000 36864 module_alloc+0x4f/0x55 pages=8 vmalloc 0xffffffffa0034000-0xffffffffa003d000 36864 module_alloc+0x4f/0x55 pages=8 vmalloc 0xffffffffa003d000-0xffffffffa0049000 49152 module_alloc+0x4f/0x55 pages=11 vmalloc 0xffffffffa0049000-0xffffffffa0050000 28672 module_alloc+0x4f/0x55 pages=6 vmalloc [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 65 +++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index afa550f66537..e33e0ae69ad1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,7 @@ DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node); + int node, void *caller); static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -204,9 +205,9 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_pfn); -static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end, - int node, gfp_t gfp_mask) +static struct vm_struct * +__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, void *caller) { struct vm_struct **p, *tmp, *area; unsigned long align = 1; @@ -269,6 +270,7 @@ found: area->pages = NULL; area->nr_pages = 0; area->phys_addr = 0; + area->caller = caller; write_unlock(&vmlist_lock); return area; @@ -284,7 +286,8 @@ out: struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); @@ -299,14 +302,22 @@ EXPORT_SYMBOL_GPL(__get_vm_area); */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + -1, GFP_KERNEL, __builtin_return_address(0)); +} + +struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, + void *caller) +{ + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + -1, GFP_KERNEL, caller); } struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask) { return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, - gfp_mask); + gfp_mask, __builtin_return_address(0)); } /* Caller must hold vmlist_lock */ @@ -455,9 +466,11 @@ void *vmap(struct page **pages, unsigned int count, if (count > num_physpages) return NULL; - area = get_vm_area((count << PAGE_SHIFT), flags); + area = get_vm_area_caller((count << PAGE_SHIFT), flags, + __builtin_return_address(0)); if (!area) return NULL; + if (map_vm_area(area, prot, 
&pages)) { vunmap(area->addr); return NULL; @@ -468,7 +481,7 @@ void *vmap(struct page **pages, unsigned int count, EXPORT_SYMBOL(vmap); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, int node, void *caller) { struct page **pages; unsigned int nr_pages, array_size, i; @@ -480,7 +493,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, - PAGE_KERNEL, node); + PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, @@ -488,6 +501,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, node); } area->pages = pages; + area->caller = caller; if (!area->pages) { remove_vm_area(area->addr); kfree(area); @@ -521,7 +535,8 @@ fail: void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_area_node(area, gfp_mask, prot, -1); + return __vmalloc_area_node(area, gfp_mask, prot, -1, + __builtin_return_address(0)); } /** @@ -536,7 +551,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * kernel virtual space, using a pagetable protection of @prot. */ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node) + int node, void *caller) { struct vm_struct *area; @@ -544,16 +559,19 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!size || (size >> PAGE_SHIFT) > num_physpages) return NULL; - area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); + area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, + node, gfp_mask, caller); + if (!area) return NULL; - return __vmalloc_area_node(area, gfp_mask, prot, node); + return __vmalloc_area_node(area, gfp_mask, prot, node, caller); } void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, gfp_mask, prot, -1); + return __vmalloc_node(size, gfp_mask, prot, -1, + __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -568,7 +586,8 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -608,7 +627,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -843,7 +863,8 @@ struct vm_struct *alloc_vm_area(size_t size) { struct vm_struct *area; - area = get_vm_area(size, VM_IOREMAP); + area = get_vm_area_caller(size, VM_IOREMAP, + __builtin_return_address(0)); if (area == NULL) return NULL; @@ -914,6 +935,14 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, "0x%p-0x%p %7ld", v->addr, v->addr + v->size, v->size); + if (v->caller) { + char buff[2 * KSYM_NAME_LEN]; + + seq_putc(m, ' '); + sprint_symbol(buff, (unsigned long)v->caller); + seq_puts(m, buff); + } + if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); -- cgit v1.2.3 From 0a128b2b1a5e8ebce0260e3345812ee70daccc7f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Apr 2008 02:12:52 -0700 Subject: pageflags: eliminate PG_xxx aliases 
Remove aliases of PG_xxx. We can easily drop those now and alias by specifying the PG_xxx flag in the macro that generates the functions. Signed-off-by: Christoph Lameter Cc: Andy Whitcroft Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Mel Gorman Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index af28e2cec8b4..e0fc3baba843 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -632,7 +632,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) if (PageReserved(page)) return 1; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); set_page_private(page, 0); -- cgit v1.2.3 From e20b8cca760ed2a6abcfe37ef56f2306790db648 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Apr 2008 02:12:55 -0700 Subject: PAGEFLAGS_EXTENDED and separate page flags for Head and Tail Having separate page flags for the head and the tail of a compound page allows the compiler to use bitops instead of operations on a word to check for a tail page. That is f.e. important for virt_to_head_page() which is used in various critical code paths (kfree for example): Code for PageTail(page) Before: mov (%rdi),%rdx page->flags mov %rdx,%rax 3 bytes and $0x12000,%eax 5 bytes cmp $0x12000,%rax 6 bytes je 897 After: mov (%rdi),%rax test $0x40,%ah (3 bytes) jne 887 So we go from 14 bytes to 3 bytes and from 3 instructions to one. From the use of 2 registers we go to none. We can only use page flags for this if we have page flags available. This patch introduces CONFIG_PAGEFLAGS_EXTENDED that is set if pageflags are not scarce due to SPARSEMEM using page flags for its sectionid on 32 bit NUMA platforms. Additional page flag definitions can be added to the CONFIG_PAGEFLAGS_EXTENDED section in page-flags.h if the functionality depends on PAGEFLAGS_EXTENDED or if more page flag overlapping tricks are used for the !PAGEFLAGS_EXTENDED fallback (the upcoming virtual compound patch may hook in here and Rik's/Lee's additional page flags to solve the reclaim issues could also be added there [hint... hint... where are these patchsets?]). Avoiding the overlaying of Pg_reclaim also clears the way for possible use of compound pages for the pagecache or on the LRU. Signed-off-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 0016ebd4dcba..3aa819d628c1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -143,6 +143,18 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +# +# If we have space for more page flags then we can enable additional +# optimizations and functionality. +# +# Regular Sparsemem takes page flag bits for the sectionid if it does not +# use a virtual memmap. Disable extended page flags for 32 bit platforms +# that require the use of a sectionid in the page flags. 
+# +config PAGEFLAGS_EXTENDED + def_bool y + depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. -- cgit v1.2.3 From b379d790197cdf8a95fb67507d75a24ac0a1678d Mon Sep 17 00:00:00 2001 From: Jared Hulbert Date: Mon, 28 Apr 2008 02:12:58 -0700 Subject: mm: introduce VM_MIXEDMAP This series introduces some important infrastructure work. The overall result is that: 1. We now support XIP backed filesystems using memory that have no struct page allocated to them. And patches 6 and 7 actually implement this for s390. This is pretty important in a number of cases. As far as I understand, in the case of virtualisation (eg. s390), each guest may mount a readonly copy of the same filesystem (eg. the distro). Currently, guests need to allocate struct pages for this image. So if you have 100 guests, you already need to allocate more memory for the struct pages than the size of the image. I think. (Carsten?) For other (eg. embedded) systems, you may have a very large non- volatile filesystem. If you have to have struct pages for this, then your RAM consumption will go up proportionally to fs size. Even though it is just a small proportion, the RAM can be much more costly eg in terms of power, so every KB less that Linux uses makes it more attractive to a lot of these guys. 2. VM_MIXEDMAP allows us to support mappings where you actually do want to refcount _some_ pages in the mapping, but not others, and support COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM filesystem in progress. Future iterations of this filesystem will most likely want to migrate pages between pagecache and XIP backing, which is where the requirement for mixed (some refcounted, some not) comes from. 3. pte_special also has a peripheral usage that I need for my lockless get_user_pages patch. That was shown to speed up "oltp" on db2 by 10% on a 2 socket system, which is kind of significant because they scrounge for months to try to find 0.1% improvement on these workloads. I'm hoping we might finally be faster than AIX on pSeries with this :). My reference to lockless get_user_pages is not meant to justify this patchset (which doesn't include lockless gup), but just to show that pte_special is not some s390 specific thing that should be hidden in arch code or xip code: I definitely want to use it on at least x86 and powerpc as well. This patch: Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in that it can support COW mappings of arbitrary ranges including ranges without struct page *and* ranges with a struct page that we actually want to refcount (PFNMAP can only support COW in those cases where the un-COW-ed translations are mapped linearly in the virtual address, and can only support non refcounted ranges). VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings). 
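As a rough sketch of how a driver might opt in to the new semantics (illustrative only: mydev_mmap and mydev_vm_ops are made-up names, and a real user would also provide a fault/nopfn handler to populate the mapping):

#include <linux/fs.h>
#include <linux/mm.h>

static struct vm_operations_struct mydev_vm_ops;        /* fault handler omitted in this sketch */

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* Mixed mapping: ptes may reference refcounted pages or raw pfns. */
        vma->vm_flags |= VM_MIXEDMAP;
        vma->vm_ops = &mydev_vm_ops;
        return 0;
}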
Signed-off-by: Jared Hulbert Signed-off-by: Nick Piggin Acked-by: Carsten Otte Cc: Jared Hulbert Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 79 +++++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 46958fb97c2d..0da414c383e7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -371,35 +371,65 @@ static inline int is_cow_mapping(unsigned int flags) } /* - * This function gets the "struct page" associated with a pte. + * This function gets the "struct page" associated with a pte or returns + * NULL if no "struct page" is associated with the pte. * - * NOTE! Some mappings do not have "struct pages". A raw PFN mapping - * will have each page table entry just pointing to a raw page frame - * number, and as far as the VM layer is concerned, those do not have - * pages associated with them - even if the PFN might point to memory + * A raw VM_PFNMAP mapping (ie. one that is not COWed) may not have any "struct + * page" backing, and even if they do, they are not refcounted. COWed pages of + * a VM_PFNMAP do always have a struct page, and they are normally refcounted + * (they are _normal_ pages). + * + * So a raw PFNMAP mapping will have each page table entry just pointing + * to a page frame number, and as far as the VM layer is concerned, those do + * not have pages associated with them - even if the PFN might point to memory * that otherwise is perfectly fine and has a "struct page". * - * The way we recognize those mappings is through the rules set up - * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, - * and the vm_pgoff will point to the first PFN mapped: thus every + * The way we recognize COWed pages within VM_PFNMAP mappings is through the + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit + * set, and the vm_pgoff will point to the first PFN mapped: thus every * page that is a raw mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * - * and if that isn't true, the page has been COW'ed (in which case it - * _does_ have a "struct page" associated with it even if it is in a - * VM_PFNMAP range). + * A call to vm_normal_page() will return NULL for such a page. + * + * If the page doesn't follow the "remap_pfn_range()" rule in a VM_PFNMAP + * then the page has been COW'ed. A COW'ed page _does_ have a "struct page" + * associated with it even if it is in a VM_PFNMAP range. Calling + * vm_normal_page() on such a page will therefore return the "struct page". + * + * + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct + * page" backing, however the difference is that _all_ pages with a struct + * page (that is, those where pfn_valid is true) are refcounted and considered + * normal pages by the VM. The disadvantage is that pages are refcounted + * (which can be slower and simply not an option for some PFNMAP users). The + * advantage is that we don't have to follow the strict linearity rule of + * PFNMAP mappings in order to support COWable mappings. + * + * A call to vm_normal_page() with a VM_MIXEDMAP mapping will return the + * associated "struct page" or NULL for memory not backed by a "struct page". + * + * + * All other mappings should have a valid struct page, which will be + * returned by a call to vm_normal_page(). 
*/ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long pfn = pte_pfn(pte); - if (unlikely(vma->vm_flags & VM_PFNMAP)) { - unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off = (addr-vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } } #ifdef CONFIG_DEBUG_VM @@ -422,6 +452,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ * The PAGE_ZERO() pages and various VDSO mappings can * cause them to exist. */ +out: return pfn_to_page(pfn); } @@ -1232,8 +1263,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, entry; spinlock_t *ptl; - BUG_ON(!(vma->vm_flags & VM_PFNMAP)); - BUG_ON(is_cow_mapping(vma->vm_flags)); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); @@ -2365,10 +2399,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long pfn; pte_unmap(page_table); - BUG_ON(!(vma->vm_flags & VM_PFNMAP)); - BUG_ON(is_cow_mapping(vma->vm_flags)); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); + + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + if (unlikely(pfn == NOPFN_OOM)) return VM_FAULT_OOM; else if (unlikely(pfn == NOPFN_SIGBUS)) -- cgit v1.2.3 From 7e675137a8e1a4d45822746456dd389b65745bf6 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 28 Apr 2008 02:13:00 -0700 Subject: mm: introduce pte_special pte bit s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory model (which is more dynamic than most). Instead, they had proposed to implement it with an additional path through vm_normal_page(), using a bit in the pte to determine whether or not the page should be refcounted: vm_normal_page() { ... if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { #ifdef s390 if (!mixedmap_refcount_pte(pte)) return NULL; #else if (!pfn_valid(pfn)) return NULL; #endif goto out; } ... } This is fine, however if we are allowed to use a bit in the pte to determine refcountedness, we can use that to _completely_ replace all the vma based schemes. So instead of adding more cases to the already complex vma-based scheme, we can have a clearly seperate and simple pte-based scheme (and get slightly better code generation in the process): vm_normal_page() { #ifdef s390 if (!mixedmap_refcount_pte(pte)) return NULL; return pte_page(pte); #else ... #endif } And finally, we may rather make this concept usable by any architecture rather than making it s390 only, so implement a new type of pte state for this. Unfortunately the old vma based code must stay, because some architectures may not be able to spare pte bits. This makes vm_normal_page a little bit more ugly than we would like, but the 2 cases are clearly seperate. 
So introduce a pte_special pte state, and use it in mm/memory.c. It is currently a noop for all architectures, so this doesn't actually result in any compiled code changes to mm/memory.o. BTW: I haven't put vm_normal_page() into arch code as-per an earlier suggestion. The reason is that, regardless of where vm_normal_page is actually implemented, the *abstraction* is still exactly the same. Also, while it depends on whether the architecture has pte_special or not, that is the only two possible cases, and it really isn't an arch specific function -- the role of the arch code should be to provide primitive functions and accessors with which to build the core code; pte_special does that. We do not want architectures to know or care about vm_normal_page itself, and we definitely don't want them being able to invent something new there out of sight of mm/ code. If we made vm_normal_page an arch function, then we have to make vm_insert_mixed (next patch) an arch function too. So I don't think moving it to arch code fundamentally improves any abstractions, while it does practically make the code more difficult to follow, for both mm and arch developers, and easier to misuse. [akpm@linux-foundation.org: build fix] Signed-off-by: Nick Piggin Acked-by: Carsten Otte Cc: Jared Hulbert Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 99 ++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 55 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0da414c383e7..c5e88bcd8ec3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -371,33 +371,37 @@ static inline int is_cow_mapping(unsigned int flags) } /* - * This function gets the "struct page" associated with a pte or returns - * NULL if no "struct page" is associated with the pte. + * vm_normal_page -- This function gets the "struct page" associated with a pte. * - * A raw VM_PFNMAP mapping (ie. one that is not COWed) may not have any "struct - * page" backing, and even if they do, they are not refcounted. COWed pages of - * a VM_PFNMAP do always have a struct page, and they are normally refcounted - * (they are _normal_ pages). + * "Special" mappings do not wish to be associated with a "struct page" (either + * it doesn't exist, or it exists but they don't want to touch it). In this + * case, NULL is returned here. "Normal" mappings do have a struct page. * - * So a raw PFNMAP mapping will have each page table entry just pointing - * to a page frame number, and as far as the VM layer is concerned, those do - * not have pages associated with them - even if the PFN might point to memory - * that otherwise is perfectly fine and has a "struct page". + * There are 2 broad cases. Firstly, an architecture may define a pte_special() + * pte bit, in which case this function is trivial. Secondly, an architecture + * may not have a spare pte bit, which requires a more complicated scheme, + * described below. + * + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a + * special mapping (even if there are underlying and valid "struct pages"). + * COWed pages of a VM_PFNMAP are always normal. 
* * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit - * set, and the vm_pgoff will point to the first PFN mapped: thus every - * page that is a raw mapping will always honor the rule + * set, and the vm_pgoff will point to the first PFN mapped: thus every special + * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * - * A call to vm_normal_page() will return NULL for such a page. + * And for normal mappings this is false. + * + * This restricts such mappings to be a linear translation from virtual address + * to pfn. To get around this restriction, we allow arbitrary mappings so long + * as the vma is not a COW mapping; in that case, we know that all ptes are + * special (because none can have been COWed). * - * If the page doesn't follow the "remap_pfn_range()" rule in a VM_PFNMAP - * then the page has been COW'ed. A COW'ed page _does_ have a "struct page" - * associated with it even if it is in a VM_PFNMAP range. Calling - * vm_normal_page() on such a page will therefore return the "struct page". * + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct @@ -407,16 +411,29 @@ static inline int is_cow_mapping(unsigned int flags) * advantage is that we don't have to follow the strict linearity rule of * PFNMAP mappings in order to support COWable mappings. * - * A call to vm_normal_page() with a VM_MIXEDMAP mapping will return the - * associated "struct page" or NULL for memory not backed by a "struct page". - * - * - * All other mappings should have a valid struct page, which will be - * returned by a call to vm_normal_page(). */ -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +#ifdef __HAVE_ARCH_PTE_SPECIAL +# define HAVE_PTE_SPECIAL 1 +#else +# define HAVE_PTE_SPECIAL 0 +#endif +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { - unsigned long pfn = pte_pfn(pte); + unsigned long pfn; + + if (HAVE_PTE_SPECIAL) { + if (likely(!pte_special(pte))) { + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + return pte_page(pte); + } + VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + return NULL; + } + + /* !HAVE_PTE_SPECIAL case follows: */ + + pfn = pte_pfn(pte); if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -424,7 +441,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; goto out; } else { - unsigned long off = (addr-vma->vm_start) >> PAGE_SHIFT; + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) @@ -432,25 +450,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ } } -#ifdef CONFIG_DEBUG_VM - /* - * Add some anal sanity checks for now. Eventually, - * we should just do "return pfn_to_page(pfn)", but - * in the meantime we check that we get a valid pfn, - * and that the resulting page looks ok. - */ - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - return NULL; - } -#endif + VM_BUG_ON(!pfn_valid(pfn)); /* - * NOTE! We still have PageReserved() pages in the page - * tables. + * NOTE! We still have PageReserved() pages in the page tables. 
* - * The PAGE_ZERO() pages and various VDSO mappings can - * cause them to exist. + * eg. VDSO mappings can cause them to exist. */ out: return pfn_to_page(pfn); @@ -1263,6 +1268,12 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, entry; spinlock_t *ptl; + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. + */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); @@ -1278,7 +1289,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, goto out_unlock; /* Ok, finally just insert the thing.. */ - entry = pfn_pte(pfn, vma->vm_page_prot); + entry = pte_mkspecial(pfn_pte(pfn, vma->vm_page_prot)); set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, entry); @@ -1309,7 +1320,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); -- cgit v1.2.3 From 423bad600443c590f34ed7ce357591f76f48f137 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 28 Apr 2008 02:13:01 -0700 Subject: mm: add vm_insert_mixed vm_insert_mixed will insert either a raw pfn or a refcounted struct page into the page tables, depending on whether vm_normal_page() will return the page or not. With the introduction of the new pte bit, this is now a too tricky for drivers to be doing themselves. filemap_xip uses this in a subsequent patch. Signed-off-by: Nick Piggin Cc: Jared Hulbert Cc: Carsten Otte Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 86 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c5e88bcd8ec3..bbab1e37055e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1176,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, * old drivers should use this, and they needed to mark their * pages reserved for the old functions anyway. */ -static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) +static int insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) { + struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1237,17 +1239,46 @@ out: * * The page does not need to be reserved. 
*/ -int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!page_count(page)) return -EINVAL; vma->vm_flags |= VM_INSERTPAGE; - return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); +static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte, entry; + spinlock_t *ptl; + + retval = -ENOMEM; + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + entry = pte_mkspecial(pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, entry); + update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + /** * vm_insert_pfn - insert single pfn into user vma * @vma: user vma to map to @@ -1261,13 +1292,8 @@ EXPORT_SYMBOL(vm_insert_page); * in that case the handler should return NULL. */ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) + unsigned long pfn) { - struct mm_struct *mm = vma->vm_mm; - int retval; - pte_t *pte, entry; - spinlock_t *ptl; - /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1280,27 +1306,35 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); - retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); - /* Ok, finally just insert the thing.. */ - entry = pte_mkspecial(pfn_pte(pfn, vma->vm_page_prot)); - set_pte_at(mm, addr, pte, entry); - update_mmu_cache(vma, addr, entry); +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); - retval = 0; -out_unlock: - pte_unmap_unlock(pte, ptl); + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; -out: - return retval; + /* + * If we don't have pte special, then we have to use the pfn_valid() + * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* + * refcount the page if pfn_valid is true (hence insert_page rather + * than insert_pfn). + */ + if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + struct page *page; + + page = pfn_to_page(pfn); + return insert_page(vma, addr, page, vma->vm_page_prot); + } + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_mixed); /* * maps a range of physical memory into the requested pages. the old -- cgit v1.2.3 From 70688e4dd1647f0ceb502bbd5964fa344c5eb411 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 28 Apr 2008 02:13:02 -0700 Subject: xip: support non-struct page backed memory Convert XIP to support non-struct page backed memory, using VM_MIXEDMAP for the user mappings. 
This requires the get_xip_page API to be changed to an address based one. Improve the API layering a little bit too, while we're here. This is required in order to support XIP filesystems on memory that isn't backed with struct page (but memory with struct page is still supported too). Signed-off-by: Nick Piggin Acked-by: Carsten Otte Cc: Jared Hulbert Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 2 +- mm/filemap_xip.c | 200 +++++++++++++++++++++++++++---------------------------- mm/madvise.c | 2 +- 3 files changed, 102 insertions(+), 102 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 3c0f1e99f5e4..343cfdfebd9e 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -49,7 +49,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) goto out; } - if (mapping->a_ops->get_xip_page) { + if (mapping->a_ops->get_xip_mem) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 5e598c42afd7..3e744abcce9d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * We do use our own empty page to avoid interference with other users @@ -42,37 +43,41 @@ static struct page *xip_sparse_page(void) /* * This is a file read routine for execute in place files, and uses - * the mapping->a_ops->get_xip_page() function for the actual low-level + * the mapping->a_ops->get_xip_mem() function for the actual low-level * stuff. * * Note the struct file* is not used at all. It may be NULL. */ -static void +static ssize_t do_xip_mapping_read(struct address_space *mapping, struct file_ra_state *_ra, struct file *filp, - loff_t *ppos, - read_descriptor_t *desc, - read_actor_t actor) + char __user *buf, + size_t len, + loff_t *ppos) { struct inode *inode = mapping->host; pgoff_t index, end_index; unsigned long offset; - loff_t isize; + loff_t isize, pos; + size_t copied = 0, error = 0; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + pos = *ppos; + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; isize = i_size_read(inode); if (!isize) goto out; end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - for (;;) { - struct page *page; - unsigned long nr, ret; + do { + unsigned long nr, left; + void *xip_mem; + unsigned long xip_pfn; + int zero = 0; /* nr is the maximum number of bytes to copy from this page */ nr = PAGE_CACHE_SIZE; @@ -85,19 +90,17 @@ do_xip_mapping_read(struct address_space *mapping, } } nr = nr - offset; + if (nr > len) + nr = len; - page = mapping->a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (!page) - goto no_xip_page; - if (unlikely(IS_ERR(page))) { - if (PTR_ERR(page) == -ENODATA) { + error = mapping->a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (unlikely(error)) { + if (error == -ENODATA) { /* sparse */ - page = ZERO_PAGE(0); - } else { - desc->error = PTR_ERR(page); + zero = 1; + } else goto out; - } } /* If users can be writing to this page using arbitrary @@ -105,10 +108,10 @@ do_xip_mapping_read(struct address_space *mapping, * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + /* address based flush */ ; /* - * Ok, we have the page, so now we can copy it to user space... + * Ok, we have the mem, so now we can copy it to user space... 
* * The actor routine returns how many bytes were actually used.. * NOTE! This may not be the same as how much of a user buffer @@ -116,47 +119,38 @@ do_xip_mapping_read(struct address_space *mapping, * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); - offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; + if (!zero) + left = __copy_to_user(buf+copied, xip_mem+offset, nr); + else + left = __clear_user(buf + copied, nr); - if (ret == nr && desc->count) - continue; - goto out; + if (left) { + error = -EFAULT; + goto out; + } -no_xip_page: - /* Did not get the page. Report it */ - desc->error = -EIO; - goto out; - } + copied += (nr - left); + offset += (nr - left); + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + } while (copied < len); out: - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + *ppos = pos + copied; if (filp) file_accessed(filp); + + return (copied ? copied : error); } ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { - read_descriptor_t desc; - if (!access_ok(VERIFY_WRITE, buf, len)) return -EFAULT; - desc.written = 0; - desc.arg.buf = buf; - desc.count = len; - desc.error = 0; - - do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, - ppos, &desc, file_read_actor); - - if (desc.written) - return desc.written; - else - return desc.error; + return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, + buf, len, ppos); } EXPORT_SYMBOL_GPL(xip_file_read); @@ -211,13 +205,16 @@ __xip_unmap (struct address_space * mapping, * * This function is derived from filemap_fault, but used for execute in place */ -static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) +static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct file *file = area->vm_file; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; pgoff_t size; + void *xip_mem; + unsigned long xip_pfn; + struct page *page; + int error; /* XXX: are VM_FAULT_ codes OK? 
*/ @@ -225,35 +222,44 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; - page = mapping->a_ops->get_xip_page(mapping, - vmf->pgoff*(PAGE_SIZE/512), 0); - if (!IS_ERR(page)) - goto out; - if (PTR_ERR(page) != -ENODATA) + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (likely(!error)) + goto found; + if (error != -ENODATA) return VM_FAULT_OOM; /* sparse block */ - if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && - (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) && + if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && + (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { + int err; + /* maybe shared writable, allocate new block */ - page = mapping->a_ops->get_xip_page(mapping, - vmf->pgoff*(PAGE_SIZE/512), 1); - if (IS_ERR(page)) + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, + &xip_mem, &xip_pfn); + if (error) return VM_FAULT_SIGBUS; - /* unmap page at pgoff from all other vmas */ + /* unmap sparse mappings at pgoff from all other vmas */ __xip_unmap(mapping, vmf->pgoff); + +found: + err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, + xip_pfn); + if (err == -ENOMEM) + return VM_FAULT_OOM; + BUG_ON(err); + return VM_FAULT_NOPAGE; } else { /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) return VM_FAULT_OOM; - } -out: - page_cache_get(page); - vmf->page = page; - return 0; + page_cache_get(page); + vmf->page = page; + return 0; + } } static struct vm_operations_struct xip_file_vm_ops = { @@ -262,11 +268,11 @@ static struct vm_operations_struct xip_file_vm_ops = { int xip_file_mmap(struct file * file, struct vm_area_struct * vma) { - BUG_ON(!file->f_mapping->a_ops->get_xip_page); + BUG_ON(!file->f_mapping->a_ops->get_xip_mem); file_accessed(file); vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; + vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); @@ -279,17 +285,17 @@ __xip_file_write(struct file *filp, const char __user *buf, const struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; - struct page *page; size_t bytes; ssize_t written = 0; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); do { unsigned long index; unsigned long offset; size_t copied; - char *kaddr; + void *xip_mem; + unsigned long xip_pfn; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; @@ -297,28 +303,22 @@ __xip_file_write(struct file *filp, const char __user *buf, if (bytes > count) bytes = count; - page = a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { + status = a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (status == -ENODATA) { /* we allocate a new page unmap it */ - page = a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 1); - if (!IS_ERR(page)) + status = a_ops->get_xip_mem(mapping, index, 1, + &xip_mem, &xip_pfn); + if (!status) /* unmap page at pgoff from all other vmas */ __xip_unmap(mapping, index); } - if (IS_ERR(page)) { - status = PTR_ERR(page); + if (status) break; - } - fault_in_pages_readable(buf, bytes); - kaddr = kmap_atomic(page, KM_USER0); copied = bytes - - __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page); + 
__copy_from_user_nocache(xip_mem + offset, buf, bytes); if (likely(copied > 0)) { status = copied; @@ -398,7 +398,7 @@ EXPORT_SYMBOL_GPL(xip_file_write); /* * truncate a page used for execute in place - * functionality is analog to block_truncate_page but does use get_xip_page + * functionality is analog to block_truncate_page but does use get_xip_mem * to get the page instead of page cache */ int @@ -408,9 +408,11 @@ xip_truncate_page(struct address_space *mapping, loff_t from) unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize; unsigned length; - struct page *page; + void *xip_mem; + unsigned long xip_pfn; + int err; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); blocksize = 1 << mapping->host->i_blkbits; length = offset & (blocksize - 1); @@ -421,18 +423,16 @@ xip_truncate_page(struct address_space *mapping, loff_t from) length = blocksize - length; - page = mapping->a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (!page) - return -ENOMEM; - if (unlikely(IS_ERR(page))) { - if (PTR_ERR(page) == -ENODATA) + err = mapping->a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (unlikely(err)) { + if (err == -ENODATA) /* Hole? No need to truncate */ return 0; else - return PTR_ERR(page); + return err; } - zero_user(page, offset, length); + memset(xip_mem + offset, 0, length); return 0; } EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/madvise.c b/mm/madvise.c index 93ee375b38e7..23a0ec3e0ea0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -112,7 +112,7 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; - if (file->f_mapping->a_ops->get_xip_page) { + if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ return 0; } -- cgit v1.2.3 From 3b1163006332302117b1b2acf226d4014ff46525 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Mon, 28 Apr 2008 02:13:06 -0700 Subject: Subject: [PATCH] hugetlb: vmstat events for huge page allocations Allocating huge pages directly from the buddy allocator is not guaranteed to succeed. Success depends on several factors (such as the amount of physical memory available and the level of fragmentation). With the addition of dynamic hugetlb pool resizing, allocations can occur much more frequently. For these reasons it is desirable to keep track of huge page allocation successes and failures. Add two new vmstat entries to track huge page allocations that succeed and fail. The presence of the two entries is contingent upon CONFIG_HUGETLB_PAGE being enabled. 
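Once applied, the two counters show up as lines in /proc/vmstat; a tiny hypothetical user-space check might look like the following (illustration only):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "htlb_buddy_alloc_", 17))
                        fputs(line, stdout);    /* _success and _fail counters */
        fclose(f);
        return 0;
}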
[akpm@linux-foundation.org: reduced ifdeffery] Signed-off-by: Adam Litke Signed-off-by: Eric Munson Tested-by: Mel Gorman Reviewed-by: Andy Whitcroft Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 +++++++ mm/vmstat.c | 4 ++++ 2 files changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 93ea46a0fba4..8deae4eb9696 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -242,6 +242,11 @@ static int alloc_fresh_huge_page(void) hugetlb_next_nid = next_nid; } while (!page && hugetlb_next_nid != start_nid); + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + return ret; } @@ -302,9 +307,11 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, */ nr_huge_pages_node[nid]++; surplus_huge_pages_node[nid]++; + __count_vm_event(HTLB_BUDDY_PGALLOC); } else { nr_huge_pages--; surplus_huge_pages--; + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); diff --git a/mm/vmstat.c b/mm/vmstat.c index 879bcc0a1d4c..4c21670f8d91 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -645,6 +645,10 @@ static const char * const vmstat_text[] = { "allocstall", "pgrotated", +#ifdef CONFIG_HUGETLB_PAGE + "htlb_buddy_alloc_success", + "htlb_buddy_alloc_fail", +#endif #endif }; -- cgit v1.2.3 From f0be3d32b05d3fea2fcdbbb81a39dac2a7163169 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:08 -0700 Subject: mempolicy: rename mpol_free to mpol_put This is a change that was requested some time ago by Mel Gorman. Makes sense to me, so here it is. Note: I retain the name "mpol_free_shared_policy()" because it actually does free the shared_policy, which is NOT a reference counted object. However, ... The mempolicy object[s] referenced by the shared_policy are reference counted, so mpol_put() is used to release the reference held by the shared_policy. The mempolicy might not be freed at this time, because some task attached to the shared object associated with the shared policy may be in the process of allocating a page based on the mempolicy. In that case, the task performing the allocation will hold a reference on the mempolicy, obtained via mpol_shared_policy_lookup(). The mempolicy will be freed when all tasks holding such a reference have called mpol_put() for the mempolicy. 
Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- mm/mempolicy.c | 26 +++++++++++++------------- mm/mmap.c | 6 +++--- mm/shmem.c | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8deae4eb9696..53afa8c76ada 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -116,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, break; } } - mpol_free(mpol); /* unref if mpol !NULL */ + mpol_put(mpol); /* unref if mpol !NULL */ return page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c1b907789d84..ce2c5b6bf9f8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -529,7 +529,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) if (!err) { mpol_get(new); vma->vm_policy = new; - mpol_free(old); + mpol_put(old); } return err; } @@ -595,7 +595,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) return PTR_ERR(new); - mpol_free(current->mempolicy); + mpol_put(current->mempolicy); current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE && @@ -948,7 +948,7 @@ static long do_mbind(unsigned long start, unsigned long len, } up_write(&mm->mmap_sem); - mpol_free(new); + mpol_put(new); return err; } @@ -1446,14 +1446,14 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); if (unlikely(pol != &default_policy && pol != current->mempolicy)) - __mpol_free(pol); /* finished with pol */ + __mpol_put(pol); /* finished with pol */ return node_zonelist(nid, gfp_flags); } zl = zonelist_policy(GFP_HIGHUSER, pol); if (unlikely(pol != &default_policy && pol != current->mempolicy)) { if (pol->policy != MPOL_BIND) - __mpol_free(pol); /* finished with pol */ + __mpol_put(pol); /* finished with pol */ else *mpol = pol; /* unref needed after allocation */ } @@ -1512,7 +1512,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); if (unlikely(pol != &default_policy && pol != current->mempolicy)) - __mpol_free(pol); /* finished with pol */ + __mpol_put(pol); /* finished with pol */ return alloc_page_interleave(gfp, 0, nid); } zl = zonelist_policy(gfp, pol); @@ -1522,7 +1522,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) */ struct page *page = __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol)); - __mpol_free(pol); + __mpol_put(pol); return page; } /* @@ -1624,7 +1624,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) } /* Slow path of a mpol destructor. 
*/ -void __mpol_free(struct mempolicy *p) +void __mpol_put(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; @@ -1720,7 +1720,7 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_free(n->policy); + mpol_put(n->policy); kmem_cache_free(sn_cache, n); } @@ -1780,7 +1780,7 @@ restart: sp_insert(sp, new); spin_unlock(&sp->lock); if (new2) { - mpol_free(new2->policy); + mpol_put(new2->policy); kmem_cache_free(sn_cache, new2); } return 0; @@ -1805,7 +1805,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy, /* Policy covers entire file */ pvma.vm_end = TASK_SIZE; mpol_set_shared_policy(info, &pvma, newpol); - mpol_free(newpol); + mpol_put(newpol); } } } @@ -1848,7 +1848,7 @@ void mpol_free_shared_policy(struct shared_policy *p) n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); rb_erase(&n->nd, &p->root); - mpol_free(n->policy); + mpol_put(n->policy); kmem_cache_free(sn_cache, n); } spin_unlock(&p->lock); @@ -2068,7 +2068,7 @@ int show_numa_map(struct seq_file *m, void *v) * unref shared or other task's mempolicy */ if (pol != &default_policy && pol != current->mempolicy) - __mpol_free(pol); + __mpol_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/mm/mmap.c b/mm/mmap.c index 6aaf657adb87..36c85e04fa93 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -232,7 +232,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) vma->vm_ops->close(vma); if (vma->vm_file) fput(vma->vm_file); - mpol_free(vma_policy(vma)); + mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; } @@ -626,7 +626,7 @@ again: remove_next = 1 + (end > next->vm_end); if (file) fput(file); mm->map_count--; - mpol_free(vma_policy(next)); + mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); /* * In mprotect's case 6 (see comments on vma_merge), @@ -1182,7 +1182,7 @@ munmap_back: if (file && vma_merge(mm, prev, addr, vma->vm_end, vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - mpol_free(vma_policy(vma)); + mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); fput(file); } else { diff --git a/mm/shmem.c b/mm/shmem.c index 177c7a7d2bb3..5326876d814d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1196,7 +1196,7 @@ static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); page = swapin_readahead(entry, gfp, &pvma, 0); - mpol_free(pvma.vm_policy); + mpol_put(pvma.vm_policy); return page; } @@ -1212,7 +1212,7 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); page = alloc_page_vma(gfp, &pvma, 0); - mpol_free(pvma.vm_policy); + mpol_put(pvma.vm_policy); return page; } #else /* !CONFIG_NUMA */ -- cgit v1.2.3 From 846a16bf0fc80dc95a414ffce465e3cbf9680247 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:09 -0700 Subject: mempolicy: rename mpol_copy to mpol_dup This patch renames mpol_copy() to mpol_dup() because, well, that's what it does. Like, e.g., strdup() for strings, mpol_dup() takes a pointer to an existing mempolicy, allocates a new one and copies the contents. In a later patch, I want to use the name mpol_copy() to copy the contents from one mempolicy to another like, e.g., strcpy() does for strings. 
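
A rough userspace sketch of the strdup()/strcpy() analogy, using a made-up toy struct rather than the real mempolicy (the _copy variant is shown only to illustrate the intended future naming):

    #include <stdlib.h>

    /* toy stand-in for the policy structure; not the kernel definition */
    struct policy {
        int refcnt;
        int mode;
        unsigned long nodes;
    };

    /* "dup": allocate a new object and copy the contents, like strdup() */
    static struct policy *policy_dup(const struct policy *old)
    {
        struct policy *new = malloc(sizeof(*new));

        if (!new)
            return NULL;
        *new = *old;
        new->refcnt = 1;        /* the duplicate starts with its own reference */
        return new;
    }

    /* "copy": copy contents into an existing object, like strcpy() */
    static void policy_copy(struct policy *to, const struct policy *from)
    {
        int keep = to->refcnt;  /* don't clobber the destination's refcount */

        *to = *from;
        to->refcnt = keep;
    }

    int main(void)
    {
        struct policy a = { .refcnt = 1, .mode = 2, .nodes = 0x5 };
        struct policy c = { .refcnt = 3 };
        struct policy *b = policy_dup(&a);

        policy_copy(&c, &a);    /* c takes a's mode/nodes, keeps its refcnt */
        free(b);
        return 0;
    }
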
Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++--- mm/mmap.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ce2c5b6bf9f8..e9fc1c1ae66c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1566,15 +1566,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) EXPORT_SYMBOL(alloc_pages_current); /* - * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). */ -/* Slow path of a mempolicy copy */ -struct mempolicy *__mpol_copy(struct mempolicy *old) +/* Slow path of a mempolicy duplicate */ +struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); diff --git a/mm/mmap.c b/mm/mmap.c index 36c85e04fa93..677d184b0d42 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1810,7 +1810,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - pol = mpol_copy(vma_policy(vma)); + pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { kmem_cache_free(vm_area_cachep, new); return PTR_ERR(pol); @@ -2126,7 +2126,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; - pol = mpol_copy(vma_policy(vma)); + pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { kmem_cache_free(vm_area_cachep, new_vma); return NULL; -- cgit v1.2.3 From f4e53d910b7dde2685b177f1e7c3e3e0b4a42f7b Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:10 -0700 Subject: mempolicy: write lock mmap_sem while changing task mempolicy A read of /proc//numa_maps holds the target task's mmap_sem for read while examining each vma's mempolicy. A vma's mempolicy can fall back to the task's policy. However, the task could be changing it's task policy and free the one that the show_numa_maps() is examining. To prevent this, grab the mmap_sem for write when updating task mempolicy. Pointed out to me by Christoph Lameter and extracted and reworked from Christoph's alternative mempol reference counting patch. This is analogous to the way that do_mbind() and do_get_mempolicy() prevent races between task's sharing an mm_struct [a.k.a. threads] setting and querying a mempolicy for a particular address. Note: this is necessary, but not sufficient, to allow us to stop taking an extra reference on "other task's mempolicy" in get_vma_policy. Subsequent patches will complete this update, allowing us to simplify the tests for whether we need to unref a mempolicy at various points in the code. 
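
The locking pattern is the classic reader/writer one: the /proc reader examines the policy under a read lock, so the updater must install the new object and retire the old one under a write lock. A hedged userspace sketch, with a pthread rwlock standing in for mmap_sem and a string standing in for the policy:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
    static char *policy;                    /* protected by 'lock' */

    static void set_policy(const char *name)
    {
        char *new = strdup(name);
        char *old;

        pthread_rwlock_wrlock(&lock);       /* like down_write(&mm->mmap_sem) */
        old = policy;
        policy = new;
        pthread_rwlock_unlock(&lock);

        free(old);                          /* no reader can still be using it */
    }

    static void *show_policy(void *arg)
    {
        pthread_rwlock_rdlock(&lock);       /* like down_read() in the reader */
        printf("policy: %s\n", policy ? policy : "default");
        pthread_rwlock_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        set_policy("interleave");
        pthread_create(&t, NULL, show_policy, NULL);
        set_policy("bind");
        pthread_join(t, NULL);
        free(policy);
        return 0;
    }
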
Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e9fc1c1ae66c..c6c61ea6bb8c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -591,16 +591,29 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new; + struct mm_struct *mm = current->mm; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) return PTR_ERR(new); + + /* + * prevent changing our mempolicy while show_numa_maps() + * is using it. + * Note: do_set_mempolicy() can be called at init time + * with no 'mm'. + */ + if (mm) + down_write(&mm->mmap_sem); mpol_put(current->mempolicy); current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); + if (mm) + up_write(&mm->mmap_sem); + return 0; } -- cgit v1.2.3 From ae4d8c16aa22775f5731677abb8a82f03cec877e Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:11 -0700 Subject: mempolicy: fixup Fallback for Default Shmem Policy get_vma_policy() is not handling fallback to task policy correctly when the get_policy() vm_op returns NULL. The NULL overwrites the 'pol' variable that was holding the fallback task mempolicy. So, it was falling back directly to system default policy. Fix get_vma_policy() to use only non-NULL policy returned from the vma get_policy op. shm_get_policy() was falling back to current task's mempolicy if the "backing file system" [tmpfs vs hugetlbfs] does not support the get_policy vm_op and the vma policy is null. This is incorrect for show_numa_maps() which is likely querying the numa_maps of some task other than current. Remove this fallback. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c6c61ea6bb8c..8924aaf4665c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1262,7 +1262,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, * @task != current]. It is the caller's responsibility to * free the reference in these cases. */ -static struct mempolicy * get_vma_policy(struct task_struct *task, +static struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; @@ -1270,7 +1270,10 @@ static struct mempolicy * get_vma_policy(struct task_struct *task, if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr); + struct mempolicy *vpol = vma->vm_ops->get_policy(vma, + addr); + if (vpol) + pol = vpol; shared_pol = 1; /* if pol non-NULL, add ref below */ } else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) -- cgit v1.2.3 From 45c4745af381851b0406d8e4db99e62e265691c2 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:12 -0700 Subject: mempolicy: rename struct mempolicy 'policy' member to 'mode' The terms 'policy' and 'mode' are both used in various places to describe the semantics of the value stored in the 'policy' member of struct mempolicy. 
Furthermore, the term 'policy' is used to refer to that member, to the entire struct mempolicy and to the more abstract concept of the tuple consisting of a "mode" and an optional node or set of nodes. Recently, we have added "mode flags" that are passed in the upper bits of the 'mode' [or sometimes, 'policy'] member of the numa APIs. I'd like to resolve this confusion, which perhaps only exists in my mind, by renaming the 'policy' member to 'mode' throughout, and fixing up the Documentation. Man pages will be updated separately. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8924aaf4665c..5e7eea2dc8b4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -106,7 +106,7 @@ enum zone_type policy_zone = 0; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ - .policy = MPOL_DEFAULT, + .mode = MPOL_DEFAULT, }; static const struct mempolicy_operations { @@ -211,7 +211,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); - policy->policy = mode; + policy->mode = mode; policy->flags = flags; if (nodes) { @@ -302,7 +302,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - mpol_ops[pol->policy].rebind(pol, newmask); + mpol_ops[pol->mode].rebind(pol, newmask); } /* @@ -608,7 +608,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, mpol_put(current->mempolicy); current->mempolicy = new; mpol_set_task_struct_flag(); - if (new && new->policy == MPOL_INTERLEAVE && + if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); if (mm) @@ -621,7 +621,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) { nodes_clear(*nodes); - switch (p->policy) { + switch (p->mode) { case MPOL_DEFAULT: break; case MPOL_BIND: @@ -700,14 +700,14 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, goto out; *policy = err; } else if (pol == current->mempolicy && - pol->policy == MPOL_INTERLEAVE) { + pol->mode == MPOL_INTERLEAVE) { *policy = current->il_next; } else { err = -EINVAL; goto out; } } else - *policy = pol->policy | pol->flags; + *policy = pol->mode | pol->flags; if (vma) { up_read(¤t->mm->mmap_sem); @@ -1276,7 +1276,7 @@ static struct mempolicy *get_vma_policy(struct task_struct *task, pol = vpol; shared_pol = 1; /* if pol non-NULL, add ref below */ } else if (vma->vm_policy && - vma->vm_policy->policy != MPOL_DEFAULT) + vma->vm_policy->mode != MPOL_DEFAULT) pol = vma->vm_policy; } if (!pol) @@ -1290,7 +1290,7 @@ static struct mempolicy *get_vma_policy(struct task_struct *task, static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) { /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(policy->policy == MPOL_BIND) && + if (unlikely(policy->mode == MPOL_BIND) && gfp_zone(gfp) >= policy_zone && cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) return &policy->v.nodes; @@ -1303,7 +1303,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) { 
int nd; - switch (policy->policy) { + switch (policy->mode) { case MPOL_PREFERRED: nd = policy->v.preferred_node; if (nd < 0) @@ -1353,7 +1353,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) */ unsigned slab_node(struct mempolicy *policy) { - unsigned short pol = policy ? policy->policy : MPOL_DEFAULT; + unsigned short pol = policy ? policy->mode : MPOL_DEFAULT; switch (pol) { case MPOL_INTERLEAVE: @@ -1454,9 +1454,9 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, *mpol = NULL; /* probably no unref needed */ *nodemask = NULL; /* assume !MPOL_BIND */ - if (pol->policy == MPOL_BIND) { + if (pol->mode == MPOL_BIND) { *nodemask = &pol->v.nodes; - } else if (pol->policy == MPOL_INTERLEAVE) { + } else if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); @@ -1468,7 +1468,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = zonelist_policy(GFP_HIGHUSER, pol); if (unlikely(pol != &default_policy && pol != current->mempolicy)) { - if (pol->policy != MPOL_BIND) + if (pol->mode != MPOL_BIND) __mpol_put(pol); /* finished with pol */ else *mpol = pol; /* unref needed after allocation */ @@ -1522,7 +1522,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) cpuset_update_task_memory_state(); - if (unlikely(pol->policy == MPOL_INTERLEAVE)) { + if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); @@ -1574,7 +1574,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; - if (pol->policy == MPOL_INTERLEAVE) + if (pol->mode == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); return __alloc_pages_nodemask(gfp, order, zonelist_policy(gfp, pol), nodemask_policy(gfp, pol)); @@ -1620,11 +1620,11 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return 0; - if (a->policy != b->policy) + if (a->mode != b->mode) return 0; - if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b)) + if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) return 0; - switch (a->policy) { + switch (a->mode) { case MPOL_DEFAULT: return 1; case MPOL_BIND: @@ -1644,7 +1644,7 @@ void __mpol_put(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; - p->policy = MPOL_DEFAULT; + p->mode = MPOL_DEFAULT; kmem_cache_free(policy_cache, p); } @@ -1710,7 +1710,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, - new->policy ? new->policy->policy : 0); + new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ @@ -1835,7 +1835,7 @@ int mpol_set_shared_policy(struct shared_policy *info, pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", vma->vm_pgoff, - sz, npol ? npol->policy : -1, + sz, npol ? npol->mode : -1, npol ? npol->flags : -1, npol ? nodes_addr(npol->v.nodes)[0] : -1); @@ -1935,7 +1935,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) char *p = buffer; int l; nodemask_t nodes; - unsigned short mode = pol ? pol->policy : MPOL_DEFAULT; + unsigned short mode = pol ? pol->mode : MPOL_DEFAULT; unsigned short flags = pol ? 
pol->flags : 0; switch (mode) { -- cgit v1.2.3 From aab0b1029f0843756b68e0ed3ca983685bf43ed6 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:13 -0700 Subject: mempolicy: mark shared policies for unref As part of yet another rework of mempolicy reference counting, we want to be able to identify shared policies efficiently, because they have an extra ref taken on lookup that needs to be removed when we're finished using the policy. Note: the extra ref is required because the policies are shared between tasks/processes and can be changed/freed by one task while another task is using them--e.g., for page allocation. Building on David Rientjes mempolicy "mode flags" enhancement, this patch indicates a "shared" policy by setting a new MPOL_F_SHARED flag in the flags member of the struct mempolicy added by David. MPOL_F_SHARED, and any future "internal mode flags" are reserved from bit zero up, as they will never be passed in the upper bits of the mode argument of a mempolicy API. I set the MPOL_F_SHARED flag when the policy is installed in the shared policy rb-tree. Don't need/want to clear the flag when removing from the tree as the mempolicy is freed [unref'd] internally to the sp_delete() function. However, a task could hold another reference on this mempolicy from a prior lookup. We need the MPOL_F_SHARED flag to stay put so that any tasks holding a ref will unref, eventually freeing, the mempolicy. A later patch in this series will introduce a function to conditionally unref [mpol_free] a policy. The MPOL_F_SHARED flag is one reason [currently the only reason] to unref/free a policy via the conditional free. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5e7eea2dc8b4..78b18a60b9b2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1750,6 +1750,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, n->start = start; n->end = end; mpol_get(pol); + pol->flags |= MPOL_F_SHARED; /* for unref */ n->policy = pol; return n; } -- cgit v1.2.3 From 52cd3b074050dd664380b5e8cfc85d4a6ed8ad48 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:16 -0700 Subject: mempolicy: rework mempolicy Reference Counting [yet again] After further discussion with Christoph Lameter, it has become clear that my earlier attempts to clean up the mempolicy reference counting were a bit of overkill in some areas, resulting in superflous ref/unref in what are usually fast paths. In other areas, further inspection reveals that I botched the unref for interleave policies. A separate patch, suitable for upstream/stable trees, fixes up the known errors in the previous attempt to fix reference counting. This patch reworks the memory policy referencing counting and, one hopes, simplifies the code. Maybe I'll get it right this time. See the update to the numa_memory_policy.txt document for a discussion of memory policy reference counting that motivates this patch. Summary: Lookup of mempolicy, based on (vma, address) need only add a reference for shared policy, and we need only unref the policy when finished for shared policies. So, this patch backs out all of the unneeded extra reference counting added by my previous attempt. 
It then unrefs only shared policies when we're finished with them, using the mpol_cond_put() [conditional put] helper function introduced by this patch. Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma containing just the policy. read_swap_cache_async() can call alloc_page_vma() multiple times, so we can't let alloc_page_vma() unref the shared policy in this case. To avoid this, we make a copy of any non-null shared policy and remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading a page [or multiple pages] from swap, so the overhead should not be an issue here. I introduced a new static inline function "mpol_cond_copy()" to copy the shared policy to an on-stack policy and remove the flags that would require a conditional free. The current implementation of mpol_cond_copy() assumes that the struct mempolicy contains no pointers to dynamically allocated structures that must be duplicated or reference counted during copy. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- mm/mempolicy.c | 146 +++++++++++++++++++++++++++++++-------------------------- mm/shmem.c | 16 ++++--- 3 files changed, 90 insertions(+), 74 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 53afa8c76ada..d36e1f11a5f2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -116,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, break; } } - mpol_put(mpol); /* unref if mpol !NULL */ + mpol_cond_put(mpol); return page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78b18a60b9b2..a237295f8190 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -241,6 +241,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return policy; } +/* Slow path of a mpol destructor. */ +void __mpol_put(struct mempolicy *p) +{ + if (!atomic_dec_and_test(&p->refcnt)) + return; + p->mode = MPOL_DEFAULT; + kmem_cache_free(policy_cache, p); +} + static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } @@ -719,6 +728,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, get_zonemask(pol, nmask); out: + mpol_cond_put(pol); if (vma) up_read(¤t->mm->mmap_sem); return err; @@ -1257,16 +1267,18 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. - * Returned policy has extra reference count if shared, vma, - * or some other task's policy [show_numa_maps() can pass - * @task != current]. It is the caller's responsibility to - * free the reference in these cases. + * Current or other task's task mempolicy and non-shared vma policies + * are protected by the task's mmap_sem, which must be held for read by + * the caller. + * Shared policies [those marked as MPOL_F_SHARED] require an extra reference + * count--added by the get_policy() vm_op, as appropriate--to protect against + * freeing by another task. It is the caller's responsibility to free the + * extra reference for shared policies. 
*/ static struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; - int shared_pol = 0; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -1274,20 +1286,20 @@ static struct mempolicy *get_vma_policy(struct task_struct *task, addr); if (vpol) pol = vpol; - shared_pol = 1; /* if pol non-NULL, add ref below */ } else if (vma->vm_policy && vma->vm_policy->mode != MPOL_DEFAULT) pol = vma->vm_policy; } if (!pol) pol = &default_policy; - else if (!shared_pol && pol != current->mempolicy) - mpol_get(pol); /* vma or other task's policy */ return pol; } -/* Return a nodemask representing a mempolicy */ -static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { /* Lower zones don't get a nodemask applied for MPOL_BIND */ if (unlikely(policy->mode == MPOL_BIND) && @@ -1298,8 +1310,8 @@ static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) return NULL; } -/* Return a zonelist representing a mempolicy */ -static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) +/* Return a zonelist indicated by gfp for node representing a mempolicy */ +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) { int nd; @@ -1311,10 +1323,10 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) break; case MPOL_BIND: /* - * Normally, MPOL_BIND allocations node-local are node-local - * within the allowed nodemask. However, if __GFP_THISNODE is - * set and the current node is part of the mask, we use the - * the zonelist for the first node in the mask instead. + * Normally, MPOL_BIND allocations are node-local within the + * allowed nodemask. However, if __GFP_THISNODE is set and the + * current node is part of the mask, we use the zonelist for + * the first node in the mask instead. */ nd = numa_node_id(); if (unlikely(gfp & __GFP_THISNODE) && @@ -1350,6 +1362,10 @@ static unsigned interleave_nodes(struct mempolicy *policy) /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. + * @policy must be protected by freeing by the caller. If @policy is + * the current task's mempolicy, this protection is implicit, as only the + * task can change it's policy. The system default policy requires no + * such protection. */ unsigned slab_node(struct mempolicy *policy) { @@ -1435,43 +1451,27 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @mpol = pointer to mempolicy pointer for reference counted mempolicy * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask * - * Returns a zonelist suitable for a huge page allocation. - * If the effective policy is 'BIND, returns pointer to local node's zonelist, - * and a pointer to the mempolicy's @nodemask for filtering the zonelist. - * If it is also a policy for which get_vma_policy() returns an extra - * reference, we must hold that reference until after the allocation. - * In that case, return policy via @mpol so hugetlb allocation can drop - * the reference. For non-'BIND referenced policies, we can/do drop the - * reference here, so the caller doesn't need to know about the special case - * for default and current task policy. 
+ * Returns a zonelist suitable for a huge page allocation and a pointer + * to the struct mempolicy for conditional unref after allocation. + * If the effective policy is 'BIND, returns a pointer to the mempolicy's + * @nodemask for filtering the zonelist. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { - struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; - *mpol = NULL; /* probably no unref needed */ + *mpol = get_vma_policy(current, vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ - if (pol->mode == MPOL_BIND) { - *nodemask = &pol->v.nodes; - } else if (pol->mode == MPOL_INTERLEAVE) { - unsigned nid; - - nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); - if (unlikely(pol != &default_policy && - pol != current->mempolicy)) - __mpol_put(pol); /* finished with pol */ - return node_zonelist(nid, gfp_flags); - } - zl = zonelist_policy(GFP_HIGHUSER, pol); - if (unlikely(pol != &default_policy && pol != current->mempolicy)) { - if (pol->mode != MPOL_BIND) - __mpol_put(pol); /* finished with pol */ - else - *mpol = pol; /* unref needed after allocation */ + if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + zl = node_zonelist(interleave_nid(*mpol, vma, addr, + HPAGE_SHIFT), gfp_flags); + } else { + zl = policy_zonelist(gfp_flags, *mpol); + if ((*mpol)->mode == MPOL_BIND) + *nodemask = &(*mpol)->v.nodes; } return zl; } @@ -1526,25 +1526,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); - if (unlikely(pol != &default_policy && - pol != current->mempolicy)) - __mpol_put(pol); /* finished with pol */ + mpol_cond_put(pol); return alloc_page_interleave(gfp, 0, nid); } - zl = zonelist_policy(gfp, pol); - if (pol != &default_policy && pol != current->mempolicy) { + zl = policy_zonelist(gfp, pol); + if (unlikely(mpol_needs_cond_ref(pol))) { /* - * slow path: ref counted policy -- shared or vma + * slow path: ref counted shared policy */ struct page *page = __alloc_pages_nodemask(gfp, 0, - zl, nodemask_policy(gfp, pol)); + zl, policy_nodemask(gfp, pol)); __mpol_put(pol); return page; } /* * fast path: default or task policy */ - return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol)); + return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); } /** @@ -1574,10 +1572,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ if (pol->mode == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); return __alloc_pages_nodemask(gfp, order, - zonelist_policy(gfp, pol), nodemask_policy(gfp, pol)); + policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); } EXPORT_SYMBOL(alloc_pages_current); @@ -1605,6 +1608,28 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) return new; } +/* + * If *frompol needs [has] an extra ref, copy *frompol to *tompol , + * eliminate the * MPOL_F_* flags that require conditional ref and + * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly + * after return. Use the returned value. + * + * Allows use of a mempolicy for, e.g., multiple allocations with a single + * policy lookup, even if the policy needs/has extra ref on lookup. + * shmem_readahead needs this. 
+ */ +struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, + struct mempolicy *frompol) +{ + if (!mpol_needs_cond_ref(frompol)) + return frompol; + + *tompol = *frompol; + tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ + __mpol_put(frompol); + return tompol; +} + static int mpol_match_intent(const struct mempolicy *a, const struct mempolicy *b) { @@ -1639,15 +1664,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) } } -/* Slow path of a mpol destructor. */ -void __mpol_put(struct mempolicy *p) -{ - if (!atomic_dec_and_test(&p->refcnt)) - return; - p->mode = MPOL_DEFAULT; - kmem_cache_free(policy_cache, p); -} - /* * Shared memory backing store policy support. * @@ -2081,11 +2097,7 @@ int show_numa_map(struct seq_file *m, void *v) pol = get_vma_policy(priv->task, vma, vma->vm_start); mpol_to_str(buffer, sizeof(buffer), pol); - /* - * unref shared or other task's mempolicy - */ - if (pol != &default_policy && pol != current->mempolicy) - __mpol_put(pol); + mpol_cond_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/mm/shmem.c b/mm/shmem.c index 5326876d814d..0b591c669b2d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1187,16 +1187,19 @@ static void shmem_show_mpol(struct seq_file *seq, unsigned short policy, static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, struct shmem_inode_info *info, unsigned long idx) { + struct mempolicy mpol, *spol; struct vm_area_struct pvma; struct page *page; + spol = mpol_cond_copy(&mpol, + mpol_shared_policy_lookup(&info->policy, idx)); + /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; pvma.vm_pgoff = idx; pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + pvma.vm_policy = spol; page = swapin_readahead(entry, gfp, &pvma, 0); - mpol_put(pvma.vm_policy); return page; } @@ -1204,16 +1207,17 @@ static struct page *shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, unsigned long idx) { struct vm_area_struct pvma; - struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; pvma.vm_pgoff = idx; pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); - page = alloc_page_vma(gfp, &pvma, 0); - mpol_put(pvma.vm_policy); - return page; + + /* + * alloc_page_vma() will drop the shared policy reference + */ + return alloc_page_vma(gfp, &pvma, 0); } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -- cgit v1.2.3 From bea904d54d6faa92400f10c8ea3d3828b8e1eb93 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:18 -0700 Subject: mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. 
System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc//numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 68 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a237295f8190..fea4a5da6e44 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -104,9 +104,13 @@ static struct kmem_cache *sn_cache; policied. */ enum zone_type policy_zone = 0; +/* + * run-time system-wide default policy => local allocation + */ struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ - .mode = MPOL_DEFAULT, + .mode = MPOL_PREFERRED, + .v = { .preferred_node = -1 }, }; static const struct mempolicy_operations { @@ -189,7 +193,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; + return NULL; /* simply delete any existing policy */ } VM_BUG_ON(!nodes); @@ -246,7 +250,6 @@ void __mpol_put(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; - p->mode = MPOL_DEFAULT; kmem_cache_free(policy_cache, p); } @@ -626,13 +629,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, return 0; } -/* Fill a zone bitmap for a policy */ -static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) +/* + * Return nodemask for policy for get_mempolicy() query + */ +static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { nodes_clear(*nodes); + if (p == &default_policy) + return; + switch (p->mode) { - case MPOL_DEFAULT: - break; case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: @@ -686,6 +692,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } if (flags & MPOL_F_ADDR) { + /* + * Do NOT fall back to task policy if the + * vma/shared policy at addr is NULL. We + * want to return MPOL_DEFAULT in this case. 
+ */ down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); if (!vma) { @@ -700,7 +711,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, return -EINVAL; if (!pol) - pol = &default_policy; + pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { @@ -715,8 +726,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, err = -EINVAL; goto out; } - } else - *policy = pol->mode | pol->flags; + } else { + *policy = pol == &default_policy ? MPOL_DEFAULT : + pol->mode; + *policy |= pol->flags; + } if (vma) { up_read(¤t->mm->mmap_sem); @@ -725,7 +739,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, err = 0; if (nmask) - get_zonemask(pol, nmask); + get_policy_nodemask(pol, nmask); out: mpol_cond_put(pol); @@ -1286,8 +1300,7 @@ static struct mempolicy *get_vma_policy(struct task_struct *task, addr); if (vpol) pol = vpol; - } else if (vma->vm_policy && - vma->vm_policy->mode != MPOL_DEFAULT) + } else if (vma->vm_policy) pol = vma->vm_policy; } if (!pol) @@ -1334,7 +1347,6 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) nd = first_node(policy->v.nodes); break; case MPOL_INTERLEAVE: /* should not happen */ - case MPOL_DEFAULT: nd = numa_node_id(); break; default: @@ -1369,9 +1381,15 @@ static unsigned interleave_nodes(struct mempolicy *policy) */ unsigned slab_node(struct mempolicy *policy) { - unsigned short pol = policy ? policy->mode : MPOL_DEFAULT; + if (!policy) + return numa_node_id(); + + switch (policy->mode) { + case MPOL_PREFERRED: + if (unlikely(policy->v.preferred_node >= 0)) + return policy->v.preferred_node; + return numa_node_id(); - switch (pol) { case MPOL_INTERLEAVE: return interleave_nodes(policy); @@ -1390,13 +1408,8 @@ unsigned slab_node(struct mempolicy *policy) return zone->node; } - case MPOL_PREFERRED: - if (policy->v.preferred_node >= 0) - return policy->v.preferred_node; - /* Fall through */ - default: - return numa_node_id(); + BUG(); } } @@ -1650,8 +1663,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) return 0; switch (a->mode) { - case MPOL_DEFAULT: - return 1; case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: @@ -1828,7 +1839,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy, if (policy != MPOL_DEFAULT) { struct mempolicy *newpol; - /* Falls back to MPOL_DEFAULT on any error */ + /* Falls back to NULL policy [MPOL_DEFAULT] on any error */ newpol = mpol_new(policy, flags, policy_nodes); if (!IS_ERR(newpol)) { /* Create pseudo-vma that contains just the policy */ @@ -1952,9 +1963,14 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) char *p = buffer; int l; nodemask_t nodes; - unsigned short mode = pol ? pol->mode : MPOL_DEFAULT; + unsigned short mode; unsigned short flags = pol ? pol->flags : 0; + if (!pol || pol == &default_policy) + mode = MPOL_DEFAULT; + else + mode = pol->mode; + switch (mode) { case MPOL_DEFAULT: nodes_clear(nodes); -- cgit v1.2.3 From 53f2556b6792ed99fde965f5e061749edd455623 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:20 -0700 Subject: mempolicy: mPOL_PREFERRED cleanups for "local allocation" Here are a couple of "cleanups" for MPOL_PREFERRED behavior when v.preferred_node < 0 -- i.e., "local allocation": 1) [do_]get_mempolicy() calls the now renamed get_policy_nodemask() to fetch the nodemask associated with a policy. 
Currently, get_policy_nodemask() returns the set of nodes with memory, when the policy 'mode' is 'PREFERRED, and the preferred_node is < 0. Change to return an empty nodemask, as this is what was specified to achieve "local allocation". 2) When a task is moved into a [new] cpuset, mpol_rebind_policy() is called to adjust any task and vma policy nodes to be valid in the new cpuset. However, when the policy is MPOL_PREFERRED, and the preferred_node is <0, no rebind is necessary. The "local allocation" indication is valid in any cpuset. Existing code will "do the right thing" because node_remap() will just return the argument node when it is outside of the valid range of node ids. However, I think it is clearer and cleaner to skip the remap explicitly in this case. 3) mpol_to_str() produces a printable, "human readable" string from a struct mempolicy. For MPOL_PREFERRED with preferred_node <0, show "local", as this indicates local allocation, as the task migrates among nodes. Note that this matches the usage of "local allocation" in libnuma() and numactl. Without this change, I believe that node_set() [via set_bit()] will set bit 31, resulting in a misleading display. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fea4a5da6e44..7b3ae977b158 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -645,11 +645,9 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) *nodes = p->v.nodes; break; case MPOL_PREFERRED: - /* or use current node instead of memory_map? */ - if (p->v.preferred_node < 0) - *nodes = node_states[N_HIGH_MEMORY]; - else + if (p->v.preferred_node >= 0) node_set(p->v.preferred_node, *nodes); + /* else return empty node mask for local allocation */ break; default: BUG(); @@ -804,7 +802,7 @@ int do_migrate_pages(struct mm_struct *mm, int err = 0; nodemask_t tmp; - down_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); err = migrate_vmas(mm, from_nodes, to_nodes, flags); if (err) @@ -1948,10 +1946,12 @@ void numa_default_policy(void) } /* - * Display pages allocated per node and memory policy via /proc. + * "local" is pseudo-policy: MPOL_PREFERRED with preferred_node == -1 + * Used only for mpol_to_str() */ +#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) static const char * const policy_types[] = - { "default", "prefer", "bind", "interleave" }; + { "default", "prefer", "bind", "interleave", "local" }; /* * Convert a mempolicy into a string. @@ -1962,6 +1962,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; int l; + int nid; nodemask_t nodes; unsigned short mode; unsigned short flags = pol ? 
pol->flags : 0; @@ -1978,7 +1979,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED: nodes_clear(nodes); - node_set(pol->v.preferred_node, nodes); + nid = pol->v.preferred_node; + if (nid < 0) + mode = MPOL_LOCAL; /* pseudo-policy */ + else + node_set(nid, nodes); break; case MPOL_BIND: @@ -1993,8 +1998,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) } l = strlen(policy_types[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; + if (buffer + maxlen < p + l + 1) + return -ENOSPC; strcpy(p, policy_types[mode]); p += l; @@ -2093,6 +2098,9 @@ static inline void check_huge_range(struct vm_area_struct *vma, } #endif +/* + * Display pages allocated per node and memory policy via /proc. + */ int show_numa_map(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; -- cgit v1.2.3 From fc36b8d3d819047eb4d23ca079fb4d3af20ff076 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:21 -0700 Subject: mempolicy: use MPOL_F_LOCAL to Indicate Preferred Local Policy Now that we're using "preferred local" policy for system default, we need to make this as fast as possible. Because of the variable size of the mempolicy structure [based on size of nodemasks], the preferred_node may be in a different cacheline from the mode. This can result in accessing an extra cacheline in the normal case of system default policy. Suspect this is the cause of an observed 2-3% slowdown in page fault testing relative to kernel without this patch series. To alleviate this, use an internal mode flag, MPOL_F_LOCAL in the mempolicy flags member which is guaranteed [?] to be in the same cacheline as the mode itself. Verified that reworked mempolicy now performs slightly better on 25-rc8-mm1 for both anon and shmem segments with system default and vma [preferred local] policy. 
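
The change amounts to replacing a sentinel value in one field with a flag bit in a field that sits right next to 'mode'. A userspace sketch of the resulting check (the flag value, field layout and helper below are illustrative, not the kernel's):

    #include <stdio.h>

    #define MPOL_F_LOCAL (1 << 1)           /* illustrative value only */

    /* toy layout: 'mode' and 'flags' are adjacent, so the common check
     * touches the same cacheline regardless of where the node data lives */
    struct policy {
        unsigned short mode;
        unsigned short flags;
        int preferred_node;                 /* meaningful only if !MPOL_F_LOCAL */
    };

    static int local_node(void)             /* stand-in for numa_node_id() */
    {
        return 0;
    }

    static int preferred_node(const struct policy *pol)
    {
        /* flag test replaces the old 'preferred_node < 0' sentinel check */
        if (!pol || (pol->flags & MPOL_F_LOCAL))
            return local_node();
        return pol->preferred_node;
    }

    int main(void)
    {
        struct policy local  = { .mode = 1, .flags = MPOL_F_LOCAL };
        struct policy pinned = { .mode = 1, .flags = 0, .preferred_node = 3 };

        printf("local policy  -> node %d\n", preferred_node(&local));
        printf("pinned policy -> node %d\n", preferred_node(&pinned));
        return 0;
    }
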
Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7b3ae977b158..143b019e9834 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -110,7 +110,7 @@ enum zone_type policy_zone = 0; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_PREFERRED, - .v = { .preferred_node = -1 }, + .flags = MPOL_F_LOCAL, }; static const struct mempolicy_operations { @@ -163,7 +163,7 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (!nodes) - pol->v.preferred_node = -1; /* local allocation */ + pol->flags |= MPOL_F_LOCAL; /* local allocation */ else if (nodes_empty(*nodes)) return -EINVAL; /* no allowed nodes */ else @@ -290,14 +290,15 @@ static void mpol_rebind_preferred(struct mempolicy *pol, if (pol->flags & MPOL_F_STATIC_NODES) { int node = first_node(pol->w.user_nodemask); - if (node_isset(node, *nodes)) + if (node_isset(node, *nodes)) { pol->v.preferred_node = node; - else - pol->v.preferred_node = -1; + pol->flags &= ~MPOL_F_LOCAL; + } else + pol->flags |= MPOL_F_LOCAL; } else if (pol->flags & MPOL_F_RELATIVE_NODES) { mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); pol->v.preferred_node = first_node(tmp); - } else if (pol->v.preferred_node != -1) { + } else if (!(pol->flags & MPOL_F_LOCAL)) { pol->v.preferred_node = node_remap(pol->v.preferred_node, pol->w.cpuset_mems_allowed, *nodes); @@ -645,7 +646,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) *nodes = p->v.nodes; break; case MPOL_PREFERRED: - if (p->v.preferred_node >= 0) + if (!(p->flags & MPOL_F_LOCAL)) node_set(p->v.preferred_node, *nodes); /* else return empty node mask for local allocation */ break; @@ -1324,13 +1325,12 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) /* Return a zonelist indicated by gfp for node representing a mempolicy */ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) { - int nd; + int nd = numa_node_id(); switch (policy->mode) { case MPOL_PREFERRED: - nd = policy->v.preferred_node; - if (nd < 0) - nd = numa_node_id(); + if (!(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; break; case MPOL_BIND: /* @@ -1339,16 +1339,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) * current node is part of the mask, we use the zonelist for * the first node in the mask instead. 
*/ - nd = numa_node_id(); if (unlikely(gfp & __GFP_THISNODE) && unlikely(!node_isset(nd, policy->v.nodes))) nd = first_node(policy->v.nodes); break; case MPOL_INTERLEAVE: /* should not happen */ - nd = numa_node_id(); break; default: - nd = 0; BUG(); } return node_zonelist(nd, gfp); @@ -1379,14 +1376,15 @@ static unsigned interleave_nodes(struct mempolicy *policy) */ unsigned slab_node(struct mempolicy *policy) { - if (!policy) + if (!policy || policy->flags & MPOL_F_LOCAL) return numa_node_id(); switch (policy->mode) { case MPOL_PREFERRED: - if (unlikely(policy->v.preferred_node >= 0)) - return policy->v.preferred_node; - return numa_node_id(); + /* + * handled MPOL_F_LOCAL above + */ + return policy->v.preferred_node; case MPOL_INTERLEAVE: return interleave_nodes(policy); @@ -1666,7 +1664,8 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - return a->v.preferred_node == b->v.preferred_node; + return a->v.preferred_node == b->v.preferred_node && + a->flags == b->flags; default: BUG(); return 0; @@ -1946,7 +1945,7 @@ void numa_default_policy(void) } /* - * "local" is pseudo-policy: MPOL_PREFERRED with preferred_node == -1 + * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_to_str() */ #define MPOL_LOCAL (MPOL_INTERLEAVE + 1) @@ -1962,7 +1961,6 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; int l; - int nid; nodemask_t nodes; unsigned short mode; unsigned short flags = pol ? pol->flags : 0; @@ -1979,11 +1977,10 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED: nodes_clear(nodes); - nid = pol->v.preferred_node; - if (nid < 0) + if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; /* pseudo-policy */ else - node_set(nid, nodes); + node_set(pol->v.preferred_node, nodes); break; case MPOL_BIND: @@ -2004,7 +2001,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) strcpy(p, policy_types[mode]); p += l; - if (flags) { + if (flags & MPOL_MODE_FLAGS) { int need_bar = 0; if (buffer + maxlen < p + 2) -- cgit v1.2.3 From 2291990ab36b4b2d8a81b1f92e7a046e51632a60 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:22 -0700 Subject: mempolicy: clean-up mpol-to-str() mempolicy formatting mpol-to-str() formats memory policies into printable strings. Currently this is only used to display "numa_maps". A subsequent patch will use mpol_to_str() for formatting tmpfs [shmem] mpol mount options, allowing us to remove essentially duplicate code in mm/shmem.c. This patch cleans up mpol_to_str() generally and in preparation for that patch. 1) show_numa_maps() is not checking the return code from mpol_to_str(). There's not a lot we can do in this context if mpol_to_str() did return the error [insufficient space in buffer]. Proposed "solution": just check, under DEBUG_VM, that callers are providing sufficient buffer space for the policy, flags, and a few nodes. This way, we'll get some display. show_numa_maps() is providing a 50-byte buffer, so it won't trip this check. 50-bytes should be sufficient unless one has a large number of nodes in a very sparse nodemask. 2) The display of the new mode flags ["static" & "relative"] was set up to display multiple flags, separated by a "bar" '|'. However, this support is incomplete--e.g., need_bar was never incremented; and currently, these two flags are mutually exclusive. 
So remove the "bar" support, for now, and only display one flag. 3) Use snprint() to format flags, so as not to overflow the buffer. Not that it's ever happed, AFAIK. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 143b019e9834..3c8ee31572ec 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1965,6 +1965,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode; unsigned short flags = pol ? pol->flags : 0; + /* + * Sanity check: room for longest mode, flag and some nodes + */ + VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); + if (!pol || pol == &default_policy) mode = MPOL_DEFAULT; else @@ -1991,7 +1996,6 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) default: BUG(); - return -EFAULT; } l = strlen(policy_types[mode]); @@ -2002,16 +2006,17 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += l; if (flags & MPOL_MODE_FLAGS) { - int need_bar = 0; - if (buffer + maxlen < p + 2) return -ENOSPC; *p++ = '='; + /* + * Currently, the only defined flags are mutually exclusive + */ if (flags & MPOL_F_STATIC_NODES) - p += sprintf(p, "%sstatic", need_bar++ ? "|" : ""); - if (flags & MPOL_F_RELATIVE_NODES) - p += sprintf(p, "%srelative", need_bar++ ? "|" : ""); + p += snprintf(p, buffer + maxlen - p, "static"); + else if (flags & MPOL_F_RELATIVE_NODES) + p += snprintf(p, buffer + maxlen - p, "relative"); } if (!nodes_empty(nodes)) { -- cgit v1.2.3 From 095f1fc4ebf36c64fddf9b6db29b1ab5517378e6 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:23 -0700 Subject: mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. 
Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++- mm/shmem.c | 118 +++++++-------------------------------------------------- 2 files changed, 115 insertions(+), 107 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3c8ee31572ec..155bb284dbf1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -1944,6 +1945,10 @@ void numa_default_policy(void) do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } +/* + * Parse and format mempolicy from/to strings + */ + /* * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_to_str() @@ -1952,12 +1957,107 @@ void numa_default_policy(void) static const char * const policy_types[] = { "default", "prefer", "bind", "interleave", "local" }; + +#ifdef CONFIG_TMPFS +/** + * mpol_parse_str - parse string to mempolicy + * @str: string containing mempolicy to parse + * @mode: pointer to returned policy mode + * @mode_flags: pointer to returned flags + * @policy_nodes: pointer to returned nodemask + * + * Format of input: + * [=][:] + * + * Currently only used for tmpfs/shmem mount options + */ +int mpol_parse_str(char *str, unsigned short *mode, unsigned short *mode_flags, + nodemask_t *policy_nodes) +{ + char *nodelist = strchr(str, ':'); + char *flags = strchr(str, '='); + int i; + int err = 1; + + if (nodelist) { + /* NUL-terminate mode or flags string */ + *nodelist++ = '\0'; + if (nodelist_parse(nodelist, *policy_nodes)) + goto out; + if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) + goto out; + } + if (flags) + *flags++ = '\0'; /* terminate mode string */ + + for (i = 0; i < MPOL_MAX; i++) { + if (!strcmp(str, policy_types[i])) { + *mode = i; + break; + } + } + if (i == MPOL_MAX) + goto out; + + switch (*mode) { + case MPOL_DEFAULT: + /* Don't allow a nodelist nor flags */ + if (!nodelist && !flags) + err = 0; + break; + case MPOL_PREFERRED: + /* Insist on a nodelist of one node only */ + if (nodelist) { + char *rest = nodelist; + while (isdigit(*rest)) + rest++; + if (!*rest) + err = 0; + } + break; + case MPOL_BIND: + /* Insist on a nodelist */ + if (nodelist) + err = 0; + break; + case MPOL_INTERLEAVE: + /* + * Default to online nodes with memory if no nodelist + */ + if (!nodelist) + *policy_nodes = node_states[N_HIGH_MEMORY]; + err = 0; + } + + *mode_flags = 0; + if (flags) { + /* + * Currently, we only support two mutually exclusive + * mode flags. + */ + if (!strcmp(flags, "static")) + *mode_flags |= MPOL_F_STATIC_NODES; + else if (!strcmp(flags, "relative")) + *mode_flags |= MPOL_F_RELATIVE_NODES; + else + err = 1; + } +out: + /* Restore string for error message */ + if (nodelist) + *--nodelist = ':'; + if (flags) + *--flags = '='; + return err; +} +#endif /* CONFIG_TMPFS */ + /* * Convert a mempolicy into a string. 
* Returns the number of characters in buffer (if positive) * or an error (negative) */ -static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; int l; @@ -2022,7 +2122,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) if (!nodes_empty(nodes)) { if (buffer + maxlen < p + 2) return -ENOSPC; - *p++ = '='; + *p++ = ':'; p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); } return p - buffer; diff --git a/mm/shmem.c b/mm/shmem.c index 0b591c669b2d..3c620dc10135 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1079,108 +1079,22 @@ redirty: #ifdef CONFIG_NUMA #ifdef CONFIG_TMPFS -static int shmem_parse_mpol(char *value, unsigned short *policy, - unsigned short *mode_flags, nodemask_t *policy_nodes) -{ - char *nodelist = strchr(value, ':'); - char *flags = strchr(value, '='); - int err = 1; - - if (nodelist) { - /* NUL-terminate policy string */ - *nodelist++ = '\0'; - if (nodelist_parse(nodelist, *policy_nodes)) - goto out; - if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) - goto out; - } - if (flags) - *flags++ = '\0'; - if (!strcmp(value, "default")) { - *policy = MPOL_DEFAULT; - /* Don't allow a nodelist */ - if (!nodelist) - err = 0; - } else if (!strcmp(value, "prefer")) { - *policy = MPOL_PREFERRED; - /* Insist on a nodelist of one node only */ - if (nodelist) { - char *rest = nodelist; - while (isdigit(*rest)) - rest++; - if (!*rest) - err = 0; - } - } else if (!strcmp(value, "bind")) { - *policy = MPOL_BIND; - /* Insist on a nodelist */ - if (nodelist) - err = 0; - } else if (!strcmp(value, "interleave")) { - *policy = MPOL_INTERLEAVE; - /* - * Default to online nodes with memory if no nodelist - */ - if (!nodelist) - *policy_nodes = node_states[N_HIGH_MEMORY]; - err = 0; - } - - *mode_flags = 0; - if (flags) { - /* - * Currently, we only support two mutually exclusive - * mode flags. 
- */ - if (!strcmp(flags, "static")) - *mode_flags |= MPOL_F_STATIC_NODES; - else if (!strcmp(flags, "relative")) - *mode_flags |= MPOL_F_RELATIVE_NODES; - else - err = 1; /* unrecognized flag */ - } -out: - /* Restore string for error message */ - if (nodelist) - *--nodelist = ':'; - if (flags) - *--flags = '='; - return err; -} - -static void shmem_show_mpol(struct seq_file *seq, unsigned short policy, +static void shmem_show_mpol(struct seq_file *seq, unsigned short mode, unsigned short flags, const nodemask_t policy_nodes) { - char *policy_string; - - switch (policy) { - case MPOL_PREFERRED: - policy_string = "prefer"; - break; - case MPOL_BIND: - policy_string = "bind"; - break; - case MPOL_INTERLEAVE: - policy_string = "interleave"; - break; - default: - /* MPOL_DEFAULT */ - return; - } + struct mempolicy temp; + char buffer[64]; - seq_printf(seq, ",mpol=%s", policy_string); + if (mode == MPOL_DEFAULT) + return; /* show nothing */ - if (policy != MPOL_INTERLEAVE || - !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) { - char buffer[64]; - int len; + temp.mode = mode; + temp.flags = flags; + temp.v.nodes = policy_nodes; - len = nodelist_scnprintf(buffer, sizeof(buffer), policy_nodes); - if (len < sizeof(buffer)) - seq_printf(seq, ":%s", buffer); - else - seq_printf(seq, ":?"); - } + mpol_to_str(buffer, sizeof(buffer), &temp); + + seq_printf(seq, ",mpol=%s", buffer); } #endif /* CONFIG_TMPFS */ @@ -1221,12 +1135,6 @@ static struct page *shmem_alloc_page(gfp_t gfp, } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -static inline int shmem_parse_mpol(char *value, unsigned short *policy, - unsigned short *mode_flags, nodemask_t *policy_nodes) -{ - return 1; -} - static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy, unsigned short flags, const nodemask_t policy_nodes) { @@ -2231,8 +2139,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (*rest) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (shmem_parse_mpol(value, &sbinfo->policy, - &sbinfo->flags, &sbinfo->policy_nodes)) + if (mpol_parse_str(value, &sbinfo->policy, + &sbinfo->flags, &sbinfo->policy_nodes)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", -- cgit v1.2.3 From 3f226aa1cbc006f9d90f22084f519ad2a1286cd8 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:24 -0700 Subject: mempolicy: support mpol=local tmpfs mount option For tmpfs/shmem shared policies, MPOL_DEFAULT is not necessarily equivalent to "local allocation". Because shared policies are at the same "scope" level [see Documentation/vm/numa_memory_policy.txt], as vma policies MPOL_DEFAULT means "fall back to current task policy". This patch extends the memory policy string parsing function to display "local" for MPOL_PREFERRED + MPOL_F_LOCAL. This allows one to specify local allocation as the default policy for shared memory areas via the tmpfs mpol mount option, regardless of the current task's policy. Also, "local" is now displayed for this policy. This patch allows us to accept the same input format as the display. 
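A minimal sketch of the mode-name lookup with the new pseudo-policy is below; the enum values, parse_mode() and main() are simplified assumptions for the example, not the kernel's definitions. It mirrors how "local" collapses to MPOL_PREFERRED with an empty nodemask, which the kernel then treats as local allocation. A tmpfs mount using the new option would look like, e.g., mount -t tmpfs -o mpol=local tmpfs /mnt.

/*
 * Sketch of the mode-name lookup including the "local" pseudo-policy.
 * The enum and main() below are simplified assumptions for the example,
 * not the kernel's definitions.
 */
#include <stdio.h>
#include <string.h>

enum { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL };

static const char * const policy_types[] =
	{ "default", "prefer", "bind", "interleave", "local" };

/* Returns 0 on success; *empty_nodes is set for the "local" pseudo-policy. */
static int parse_mode(const char *str, int *mode, int *empty_nodes)
{
	int i;

	for (i = 0; i <= MPOL_LOCAL; i++)
		if (!strcmp(str, policy_types[i]))
			break;
	if (i > MPOL_LOCAL)
		return 1;

	/*
	 * "local" is not a real mode: hand back MPOL_PREFERRED with an
	 * empty nodemask, which means "allocate on the local node".
	 */
	*empty_nodes = (i == MPOL_LOCAL);
	*mode = (i == MPOL_LOCAL) ? MPOL_PREFERRED : i;
	return 0;
}

int main(void)
{
	int mode, empty_nodes;

	if (!parse_mode("local", &mode, &empty_nodes))
		printf("mode=%d (MPOL_PREFERRED=%d) empty nodemask=%d\n",
		       mode, MPOL_PREFERRED, empty_nodes);
	return 0;
}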
Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 155bb284dbf1..6b751565eed1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1951,7 +1951,7 @@ void numa_default_policy(void) /* * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag - * Used only for mpol_to_str() + * Used only for mpol_parse_str() and mpol_to_str() */ #define MPOL_LOCAL (MPOL_INTERLEAVE + 1) static const char * const policy_types[] = @@ -1990,21 +1990,16 @@ int mpol_parse_str(char *str, unsigned short *mode, unsigned short *mode_flags, if (flags) *flags++ = '\0'; /* terminate mode string */ - for (i = 0; i < MPOL_MAX; i++) { + for (i = 0; i <= MPOL_LOCAL; i++) { if (!strcmp(str, policy_types[i])) { *mode = i; break; } } - if (i == MPOL_MAX) + if (i > MPOL_LOCAL) goto out; switch (*mode) { - case MPOL_DEFAULT: - /* Don't allow a nodelist nor flags */ - if (!nodelist && !flags) - err = 0; - break; case MPOL_PREFERRED: /* Insist on a nodelist of one node only */ if (nodelist) { @@ -2027,6 +2022,20 @@ int mpol_parse_str(char *str, unsigned short *mode, unsigned short *mode_flags, if (!nodelist) *policy_nodes = node_states[N_HIGH_MEMORY]; err = 0; + break; + default: + /* + * MPOL_DEFAULT or MPOL_LOCAL + * Don't allow a nodelist nor flags + */ + if (!nodelist && !flags) + err = 0; + if (*mode == MPOL_DEFAULT) + goto out; + /* else MPOL_LOCAL */ + *mode = MPOL_PREFERRED; + nodes_clear(*policy_nodes); + break; } *mode_flags = 0; -- cgit v1.2.3 From 71fe804b6d56d6a7aed680e096901434cef6a2c3 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:26 -0700 Subject: mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. 
However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 144 +++++++++++++++++++++++++++++++++++---------------------- mm/shmem.c | 57 +++++++++++++---------- 2 files changed, 121 insertions(+), 80 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6b751565eed1..a37a5034f63d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1828,27 +1828,35 @@ restart: return 0; } -void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy, - unsigned short flags, nodemask_t *policy_nodes) -{ - info->root = RB_ROOT; - spin_lock_init(&info->lock); - - if (policy != MPOL_DEFAULT) { - struct mempolicy *newpol; - - /* Falls back to NULL policy [MPOL_DEFAULT] on any error */ - newpol = mpol_new(policy, flags, policy_nodes); - if (!IS_ERR(newpol)) { - /* Create pseudo-vma that contains just the policy */ - struct vm_area_struct pvma; - - memset(&pvma, 0, sizeof(struct vm_area_struct)); - /* Policy covers entire file */ - pvma.vm_end = TASK_SIZE; - mpol_set_shared_policy(info, &pvma, newpol); - mpol_put(newpol); - } +/** + * mpol_shared_policy_init - initialize shared policy for inode + * @sp: pointer to inode shared policy + * @mpol: struct mempolicy to install + * + * Install non-NULL @mpol in inode's shared policy rb-tree. + * On entry, the current task has a reference on a non-NULL @mpol. + * This must be released on exit. + */ +void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) +{ + sp->root = RB_ROOT; /* empty tree == default mempolicy */ + spin_lock_init(&sp->lock); + + if (mpol) { + struct vm_area_struct pvma; + struct mempolicy *new; + + /* contextualize the tmpfs mount point mempolicy */ + new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (IS_ERR(new)) + return; /* no valid nodemask intersection */ + + /* Create pseudo-vma that contains just the policy */ + memset(&pvma, 0, sizeof(struct vm_area_struct)); + pvma.vm_end = TASK_SIZE; /* policy covers entire file */ + mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + mpol_put(new); /* drop initial ref */ } } @@ -1962,18 +1970,27 @@ static const char * const policy_types[] = /** * mpol_parse_str - parse string to mempolicy * @str: string containing mempolicy to parse - * @mode: pointer to returned policy mode - * @mode_flags: pointer to returned flags - * @policy_nodes: pointer to returned nodemask + * @mpol: pointer to struct mempolicy pointer, returned on success. 
+ * @no_context: flag whether to "contextualize" the mempolicy * * Format of input: * [=][:] * - * Currently only used for tmpfs/shmem mount options + * if @no_context is true, save the input nodemask in w.user_nodemask in + * the returned mempolicy. This will be used to "clone" the mempolicy in + * a specific context [cpuset] at a later time. Used to parse tmpfs mpol + * mount option. Note that if 'static' or 'relative' mode flags were + * specified, the input nodemask will already have been saved. Saving + * it again is redundant, but safe. + * + * On success, returns 0, else 1 */ -int mpol_parse_str(char *str, unsigned short *mode, unsigned short *mode_flags, - nodemask_t *policy_nodes) +int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) { + struct mempolicy *new = NULL; + unsigned short uninitialized_var(mode); + unsigned short uninitialized_var(mode_flags); + nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int i; @@ -1982,26 +1999,30 @@ int mpol_parse_str(char *str, unsigned short *mode, unsigned short *mode_flags, if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; - if (nodelist_parse(nodelist, *policy_nodes)) + if (nodelist_parse(nodelist, nodes)) goto out; - if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) + if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) goto out; - } + } else + nodes_clear(nodes); + if (flags) *flags++ = '\0'; /* terminate mode string */ for (i = 0; i <= MPOL_LOCAL; i++) { if (!strcmp(str, policy_types[i])) { - *mode = i; + mode = i; break; } } if (i > MPOL_LOCAL) goto out; - switch (*mode) { + switch (mode) { case MPOL_PREFERRED: - /* Insist on a nodelist of one node only */ + /* + * Insist on a nodelist of one node only + */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) @@ -2010,63 +2031,73 @@ int mpol_parse_str(char *str, unsigned short *mode, unsigned short *mode_flags, err = 0; } break; - case MPOL_BIND: - /* Insist on a nodelist */ - if (nodelist) - err = 0; - break; case MPOL_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) - *policy_nodes = node_states[N_HIGH_MEMORY]; + nodes = node_states[N_HIGH_MEMORY]; err = 0; break; - default: + case MPOL_LOCAL: /* - * MPOL_DEFAULT or MPOL_LOCAL - * Don't allow a nodelist nor flags + * Don't allow a nodelist; mpol_new() checks flags */ - if (!nodelist && !flags) - err = 0; - if (*mode == MPOL_DEFAULT) + if (nodelist) goto out; - /* else MPOL_LOCAL */ - *mode = MPOL_PREFERRED; - nodes_clear(*policy_nodes); + mode = MPOL_PREFERRED; break; + + /* + * case MPOL_BIND: mpol_new() enforces non-empty nodemask. + * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. + */ } - *mode_flags = 0; + mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. 
*/ if (!strcmp(flags, "static")) - *mode_flags |= MPOL_F_STATIC_NODES; + mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) - *mode_flags |= MPOL_F_RELATIVE_NODES; + mode_flags |= MPOL_F_RELATIVE_NODES; else err = 1; } + + new = mpol_new(mode, mode_flags, &nodes); + if (IS_ERR(new)) + err = 1; + else if (no_context) + new->w.user_nodemask = nodes; /* save for contextualization */ + out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; + if (!err) + *mpol = new; return err; } #endif /* CONFIG_TMPFS */ -/* +/** + * mpol_to_str - format a mempolicy structure for printing + * @buffer: to contain formatted mempolicy string + * @maxlen: length of @buffer + * @pol: pointer to mempolicy to be formatted + * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask + * * Convert a mempolicy into a string. * Returns the number of characters in buffer (if positive) * or an error (negative) */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) { char *p = buffer; int l; @@ -2100,7 +2131,10 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - nodes = pol->v.nodes; + if (no_context) + nodes = pol->w.user_nodemask; + else + nodes = pol->v.nodes; break; default: @@ -2231,7 +2265,7 @@ int show_numa_map(struct seq_file *m, void *v) return 0; pol = get_vma_policy(priv->task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol); + mpol_to_str(buffer, sizeof(buffer), pol, 0); mpol_cond_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/mm/shmem.c b/mm/shmem.c index 3c620dc10135..e6d9298aa22a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1079,23 +1079,29 @@ redirty: #ifdef CONFIG_NUMA #ifdef CONFIG_TMPFS -static void shmem_show_mpol(struct seq_file *seq, unsigned short mode, - unsigned short flags, const nodemask_t policy_nodes) +static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { - struct mempolicy temp; char buffer[64]; - if (mode == MPOL_DEFAULT) + if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ - temp.mode = mode; - temp.flags = flags; - temp.v.nodes = policy_nodes; - - mpol_to_str(buffer, sizeof(buffer), &temp); + mpol_to_str(buffer, sizeof(buffer), mpol, 1); seq_printf(seq, ",mpol=%s", buffer); } + +static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + struct mempolicy *mpol = NULL; + if (sbinfo->mpol) { + spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + mpol = sbinfo->mpol; + mpol_get(mpol); + spin_unlock(&sbinfo->stat_lock); + } + return mpol; +} #endif /* CONFIG_TMPFS */ static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, @@ -1135,8 +1141,7 @@ static struct page *shmem_alloc_page(gfp_t gfp, } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy, - unsigned short flags, const nodemask_t policy_nodes) +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) { } #endif /* CONFIG_TMPFS */ @@ -1154,6 +1159,13 @@ static inline struct page *shmem_alloc_page(gfp_t gfp, } #endif /* CONFIG_NUMA */ +#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif + /* * shmem_getpage - either get the page from swap or allocate a new one * @@ -1508,8 
+1520,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) case S_IFREG: inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; - mpol_shared_policy_init(&info->policy, sbinfo->policy, - sbinfo->flags, &sbinfo->policy_nodes); + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); break; case S_IFDIR: inc_nlink(inode); @@ -1523,8 +1535,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. */ - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, - NULL); + mpol_shared_policy_init(&info->policy, NULL); break; } } else @@ -2139,8 +2150,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (*rest) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (mpol_parse_str(value, &sbinfo->policy, - &sbinfo->flags, &sbinfo->policy_nodes)) + if (mpol_parse_str(value, &sbinfo->mpol, 1)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", @@ -2191,9 +2201,9 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) sbinfo->free_blocks = config.max_blocks - blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; - sbinfo->policy = config.policy; - sbinfo->flags = config.flags; - sbinfo->policy_nodes = config.policy_nodes; + + mpol_put(sbinfo->mpol); + sbinfo->mpol = config.mpol; /* transfers initial ref */ out: spin_unlock(&sbinfo->stat_lock); return error; @@ -2214,8 +2224,7 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",uid=%u", sbinfo->uid); if (sbinfo->gid != 0) seq_printf(seq, ",gid=%u", sbinfo->gid); - shmem_show_mpol(seq, sbinfo->policy, sbinfo->flags, - sbinfo->policy_nodes); + shmem_show_mpol(seq, sbinfo->mpol); return 0; } #endif /* CONFIG_TMPFS */ @@ -2245,9 +2254,7 @@ static int shmem_fill_super(struct super_block *sb, sbinfo->mode = S_IRWXUGO | S_ISVTX; sbinfo->uid = current->fsuid; sbinfo->gid = current->fsgid; - sbinfo->policy = MPOL_DEFAULT; - sbinfo->flags = 0; - sbinfo->policy_nodes = node_states[N_HIGH_MEMORY]; + sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS -- cgit v1.2.3 From 8fe627ec5b7c47b1654dff50536d9709863295a3 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 28 Apr 2008 02:13:28 -0700 Subject: hugetlbfs: add missing TLB flush to hugetlb_cow() A cow break on a hugetlbfs page with page_count > 1 will set a new pte with set_huge_pte_at(), w/o any tlb flush operation. The old pte will remain in the tlb and subsequent write access to the page will result in a page fault loop, for as long as it may take until the tlb is flushed from somewhere else. This patch introduces an architecture-specific huge_ptep_clear_flush() function, which is called before the the set_huge_pte_at() in hugetlb_cow(). ATTENTION: This is just a nop on all architectures for now, the s390 implementation will come with our large page patch later. Other architectures should define their own huge_ptep_clear_flush() if needed. Acked-by: Martin Schwidefsky Signed-off-by: Gerald Schaefer Cc: Paul Mundt Cc: "Luck, Tony" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d36e1f11a5f2..262d0a93d2b6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -892,6 +892,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address & HPAGE_MASK); if (likely(pte_same(*ptep, pte))) { /* Break COW */ + huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); /* Make the old page be freed below */ -- cgit v1.2.3 From 7f2e9525ba55b1c42ad6c4a5a59d7eb7bdd9be72 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 28 Apr 2008 02:13:29 -0700 Subject: hugetlbfs: common code update for s390 Huge ptes have a special type on s390 and cannot be handled with the standard pte functions in certain cases, e.g. because of a different location of the invalid bit. This patch adds some new architecture- specific functions to hugetlb common code, as a prerequisite for the s390 large page support. This won't affect other architectures in functionality, but I need to add some new dummy inline functions to the headers. Acked-by: Martin Schwidefsky Signed-off-by: Gerald Schaefer Cc: Paul Mundt Cc: "Luck, Tony" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 262d0a93d2b6..df28c1773fb2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -132,6 +132,7 @@ static void update_and_free_page(struct page *page) } set_compound_page_dtor(page, NULL); set_page_refcounted(page); + arch_release_hugepage(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -201,6 +202,10 @@ static struct page *alloc_fresh_huge_page_node(int nid) htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, HUGETLB_PAGE_ORDER); if (page) { + if (arch_prepare_hugepage(page)) { + __free_pages(page, HUGETLB_PAGE_ORDER); + return 0; + } set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); nr_huge_pages++; @@ -735,7 +740,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); } else { - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); entry = pte_mkhuge(entry); @@ -748,8 +753,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, { pte_t entry; - entry = pte_mkwrite(pte_mkdirty(*ptep)); - if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { + entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { update_mmu_cache(vma, address, entry); } } @@ -779,10 +784,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); - if (!pte_none(*src_pte)) { + if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) - ptep_set_wrprotect(src, addr, src_pte); - entry = *src_pte; + huge_ptep_set_wrprotect(src, addr, src_pte); + entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); @@ -826,7 +831,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, continue; pte = 
huge_ptep_get_and_clear(mm, address, ptep); - if (pte_none(pte)) + if (huge_pte_none(pte)) continue; page = pte_page(pte); @@ -890,7 +895,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); - if (likely(pte_same(*ptep, pte))) { + if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, @@ -960,7 +965,7 @@ retry: goto backout; ret = 0; - if (!pte_none(*ptep)) + if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) @@ -1002,8 +1007,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * the same page in the page cache. */ mutex_lock(&hugetlb_instantiation_mutex); - entry = *ptep; - if (pte_none(entry)) { + entry = huge_ptep_get(ptep); + if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); mutex_unlock(&hugetlb_instantiation_mutex); return ret; @@ -1013,7 +1018,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ - if (likely(pte_same(entry, *ptep))) + if (likely(pte_same(entry, huge_ptep_get(ptep)))) if (write_access && !pte_write(entry)) ret = hugetlb_cow(mm, vma, address, ptep, entry); spin_unlock(&mm->page_table_lock); @@ -1043,7 +1048,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); - if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) { + if (!pte || huge_pte_none(huge_ptep_get(pte)) || + (write && !pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); @@ -1059,7 +1065,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, } pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; - page = pte_page(*pte); + page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { get_page(page); @@ -1108,7 +1114,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, continue; if (huge_pmd_unshare(mm, &address, ptep)) continue; - if (!pte_none(*ptep)) { + if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); -- cgit v1.2.3 From 04753278769f3b6c3b79a080edb52f21d83bf6e2 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Mon, 28 Apr 2008 02:13:31 -0700 Subject: memory hotplug: register section/node id to free This patch set is to free pages which is allocated by bootmem for memory-hotremove. Some structures of memory management are allocated by bootmem. ex) memmap, etc. To remove memory physically, some of them must be freed according to circumstance. This patch set makes basis to free those pages, and free memmaps. Basic my idea is using remain members of struct page to remember information of users of bootmem (section number or node id). When the section is removing, kernel can confirm it. By this information, some issues can be solved. 1) When the memmap of removing section is allocated on other section by bootmem, it should/can be free. 2) When the memmap of removing section is allocated on the same section, it shouldn't be freed. Because the section has to be logical memory offlined already and all pages must be isolated against page allocater. If it is freed, page allocator may use it which will be removed physically soon. 
3) When removing section has other section's memmap, kernel will be able to show easily which section should be removed before it for user. (Not implemented yet) 4) When the above case 2), the page isolation will be able to check and skip memmap's page when logical memory offline (offline_pages()). Current page isolation code fails in this case because this page is just reserved page and it can't distinguish this pages can be removed or not. But, it will be able to do by this patch. (Not implemented yet.) 5) The node information like pgdat has similar issues. But, this will be able to be solved too by this. (Not implemented yet, but, remembering node id in the pages.) Fortunately, current bootmem allocator just keeps PageReserved flags, and doesn't use any other members of page struct. The users of bootmem doesn't use them too. This patch: This is to register information which is node or section's id. Kernel can distinguish which node/section uses the pages allcated by bootmem. This is basis for hot-remove sections or nodes. Signed-off-by: Yasunori Goto Cc: Badari Pulavarty Cc: Yinghai Lu Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 1 + mm/memory_hotplug.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++- mm/sparse.c | 3 +- 3 files changed, 100 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index b6791646143e..369624d2789c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -461,6 +461,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { + register_page_bootmem_info_node(pgdat); return free_all_bootmem_core(pgdat); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c8b3ca79de2d..cba36ef0d506 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -58,8 +58,105 @@ static void release_memory_resource(struct resource *res) return; } - #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifndef CONFIG_SPARSEMEM_VMEMMAP +static void get_page_bootmem(unsigned long info, struct page *page, int magic) +{ + atomic_set(&page->_mapcount, magic); + SetPagePrivate(page); + set_page_private(page, info); + atomic_inc(&page->_count); +} + +void put_page_bootmem(struct page *page) +{ + int magic; + + magic = atomic_read(&page->_mapcount); + BUG_ON(magic >= -1); + + if (atomic_dec_return(&page->_count) == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + reset_page_mapcount(page); + __free_pages_bootmem(page, 0); + } + +} + +void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + if (!pfn_valid(start_pfn)) + return; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + /* Get section's memmap address */ + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + /* + * Get page for the memmap's phys address + * XXX: need more consideration for sparse_vmemmap... 
+ */ + page = virt_to_page(memmap); + mapsize = sizeof(struct page) * PAGES_PER_SECTION; + mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; + + /* remember memmap's page */ + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, SECTION_INFO); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_INFO); + +} + +void register_page_bootmem_info_node(struct pglist_data *pgdat) +{ + unsigned long i, pfn, end_pfn, nr_pages; + int node = pgdat->node_id; + struct page *page; + struct zone *zone; + + nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; + page = virt_to_page(pgdat); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + + zone = &pgdat->node_zones[0]; + for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { + if (zone->wait_table) { + nr_pages = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; + page = virt_to_page(zone->wait_table); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + } + } + + pfn = pgdat->node_start_pfn; + end_pfn = pfn + pgdat->node_spanned_pages; + + /* register_section info */ + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) + register_page_bootmem_info_section(pfn); + +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; diff --git a/mm/sparse.c b/mm/sparse.c index 186a85bf7912..8903c484389a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -210,7 +210,6 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p /* * Decode mem_map from the coded memmap */ -static struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) { /* mask off the extra low bits of information */ @@ -233,7 +232,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, return 1; } -static unsigned long usemap_size(void) +unsigned long usemap_size(void) { unsigned long size_bytes; size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; -- cgit v1.2.3 From 9d99217a02a06a7cc83f065b73e976970970c58c Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Mon, 28 Apr 2008 02:13:32 -0700 Subject: memory hotplug: align memmap to page size To free memmap easier, this patch aligns it to page size. Bootmem allocater may mix some objects in one pages. It's not good for freeing memmap of memory hot-remove. 
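The effect of page-aligning the allocation can be sketched with a toy bump allocator. This is purely an illustrative model with assumed constants and an assumed allocation sequence, not the kernel's bootmem allocator: with only cache-line alignment the memmap can start mid-page and share that page with an unrelated boot-time object, so that page could never be returned on hot-remove.

/*
 * Toy bump allocator illustrating why the memmap is now allocated with
 * page alignment.  The PAGE_SIZE/SMP_CACHE_BYTES values and the allocation
 * sequence are assumptions for the sketch, not kernel behaviour.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define SMP_CACHE_BYTES	64UL

static unsigned long cursor;	/* next free "physical" address */

static unsigned long bump_alloc(unsigned long size, unsigned long align)
{
	unsigned long addr = (cursor + align - 1) & ~(align - 1);

	cursor = addr + size;
	return addr;
}

int main(void)
{
	unsigned long small, memmap;

	bump_alloc(PAGE_SIZE, PAGE_SIZE);		/* earlier boot-time data */
	small = bump_alloc(128, SMP_CACHE_BYTES);	/* small unrelated object */

	/* cache-line alignment: memmap starts mid-page, shares it with 'small' */
	memmap = bump_alloc(512 * PAGE_SIZE, SMP_CACHE_BYTES);
	printf("cache-line aligned: offset in page = %lu\n",
	       memmap & (PAGE_SIZE - 1));

	/* page alignment: the memmap owns whole pages and can be freed alone */
	cursor = small + 128;
	memmap = bump_alloc(512 * PAGE_SIZE, PAGE_SIZE);
	printf("page aligned:       offset in page = %lu\n",
	       memmap & (PAGE_SIZE - 1));
	return 0;
}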
Signed-off-by: Yasunori Goto Cc: Badari Pulavarty Cc: Yinghai Lu Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 8903c484389a..5398d48c360a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -273,8 +273,8 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) if (map) return map; - map = alloc_bootmem_node(NODE_DATA(nid), - sizeof(struct page) * PAGES_PER_SECTION); + map = alloc_bootmem_pages_node(NODE_DATA(nid), + PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); return map; } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -- cgit v1.2.3 From e70260aabea3af2a84b951e75166dcebe689b88e Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Mon, 28 Apr 2008 02:13:32 -0700 Subject: memory hotplug: make alloc_bootmem_section() alloc_bootmem_section() can allocate specified section's area. This is used for usemap to keep same section with pgdat by later patch. Signed-off-by: Yasunori Goto Cc: Badari Pulavarty Cc: Yinghai Lu Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 369624d2789c..e8fb927392b9 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -545,6 +545,37 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, return __alloc_bootmem(size, align, goal); } +#ifdef CONFIG_SPARSEMEM +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + void *ptr; + unsigned long limit, goal, start_nr, end_nr, pfn; + struct pglist_data *pgdat; + + pfn = section_nr_to_pfn(section_nr); + goal = PFN_PHYS(pfn); + limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; + pgdat = NODE_DATA(early_pfn_to_nid(pfn)); + ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, + limit); + + if (!ptr) + return NULL; + + start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); + end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); + if (start_nr != section_nr || end_nr != section_nr) { + printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", + section_nr); + free_bootmem_core(pgdat->bdata, __pa(ptr), size); + ptr = NULL; + } + + return ptr; +} +#endif + #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif -- cgit v1.2.3 From 86f6dae1377523689bd8468fed2f2dd180fc0560 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Mon, 28 Apr 2008 02:13:33 -0700 Subject: memory hotplug: allocate usemap on the section with pgdat Usemaps are allocated on the section which has pgdat by this. Because usemap size is very small, many other sections usemaps are allocated on only one page. If a section has usemap, it can't be removed until removing other sections. This dependency is not desirable for memory removing. Pgdat has similar feature. When a section has pgdat area, it must be the last section for removing on the node. So, if section A has pgdat and section B has usemap for section A, Both sections can't be removed due to dependency each other. To solve this issue, this patch collects usemap on same section with pgdat. If other sections doesn't have any dependency, this section will be able to be removed finally. 
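The pfn and section arithmetic behind "same section as the pgdat" looks roughly like the sketch below. SECTION_SIZE_BITS, PAGE_SHIFT and the example physical address are assumptions (128 MB sections), and the macros only mirror pfn_to_section_nr()/section_nr_to_pfn() rather than being the kernel definitions.

/*
 * Sketch of how a pgdat's physical address selects the section that
 * alloc_bootmem_section() is constrained to.  All constants here are
 * assumed example values.
 */
#include <stdio.h>

#define PAGE_SHIFT		12
#define SECTION_SIZE_BITS	27		/* 128 MB sections (assumed) */
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)

#define pfn_to_section_nr(pfn)	((pfn) >> PFN_SECTION_SHIFT)
#define section_nr_to_pfn(sec)	((sec) << PFN_SECTION_SHIFT)

int main(void)
{
	/* assumed physical address of this node's pgdat structure */
	unsigned long pgdat_paddr = 0x23450000UL;

	unsigned long pfn = pgdat_paddr >> PAGE_SHIFT;
	unsigned long section_nr = pfn_to_section_nr(pfn);

	/* the usemap allocation is restricted to this section's range */
	unsigned long goal  = section_nr_to_pfn(section_nr) << PAGE_SHIFT;
	unsigned long limit = (section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT) - 1;

	printf("pgdat at %#lx -> pfn %#lx -> section %lu\n",
	       pgdat_paddr, pfn, section_nr);
	printf("usemap allocation window: [%#lx, %#lx]\n", goal, limit);
	return 0;
}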
Signed-off-by: Yasunori Goto Cc: Badari Pulavarty Cc: Yinghai Lu Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 5398d48c360a..08f053218ee8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -249,11 +249,22 @@ static unsigned long *__kmalloc_section_usemap(void) static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) { - unsigned long *usemap; + unsigned long *usemap, section_nr; struct mem_section *ms = __nr_to_section(pnum); int nid = sparse_early_nid(ms); + struct pglist_data *pgdat = NODE_DATA(nid); - usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + /* + * Usemap's page can't be freed until freeing other sections + * which use it. And, Pgdat has same feature. + * If section A has pgdat and section B has usemap for other + * sections (includes section A), both sections can't be removed, + * because there is the dependency each other. + * To solve above issue, this collects all usemap on the same section + * which has pgdat. + */ + section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + usemap = alloc_bootmem_section(usemap_size(), section_nr); if (usemap) return usemap; -- cgit v1.2.3 From 0c0a4a517a31e05efb38304668198a873bfec6ca Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Mon, 28 Apr 2008 02:13:34 -0700 Subject: memory hotplug: free memmaps allocated by bootmem This patch is to free memmaps which is allocated by bootmem. Freeing usemap is not necessary. The pages of usemap may be necessary for other sections. If removing section is last section on the node, its section is the final user of usemap page. (usemaps are allocated on its section by previous patch.) But it shouldn't be freed too, because the section must be logical offline state which all pages are isolated against page allocater. If it is freed, page alloctor may use it which will be removed physically soon. It will be disaster. So, this patch keeps it as it is. Signed-off-by: Yasunori Goto Cc: Badari Pulavarty Cc: Yinghai Lu Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 +-- mm/memory_hotplug.c | 11 +++++++++++ mm/page_alloc.c | 2 +- mm/sparse.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 60 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 789727309f4d..0034e947e4bc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -34,8 +34,7 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } -extern void __init __free_pages_bootmem(struct page *page, - unsigned int order); +extern void __free_pages_bootmem(struct page *page, unsigned int order); /* * function for dealing with page's order in buddy system. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cba36ef0d506..c4ba85c8cb00 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -198,6 +198,16 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) return register_new_memory(__pfn_to_section(phys_start_pfn)); } +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + /* + * XXX: Freeing memmap with vmemmap is not implement yet. + * This should be removed later. 
+ */ + return -EBUSY; +} +#else static int __remove_section(struct zone *zone, struct mem_section *ms) { unsigned long flags; @@ -216,6 +226,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) pgdat_resize_unlock(pgdat, &flags); return 0; } +#endif /* * Reasonably generic function for adding memory. It is diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0fc3baba843..d3358efdf4e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -546,7 +546,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) /* * permit the bootmem allocator to evade page validation on high-order frees */ -void __init __free_pages_bootmem(struct page *page, unsigned int order) +void __free_pages_bootmem(struct page *page, unsigned int order) { if (order == 0) { __ClearPageReserved(page); diff --git a/mm/sparse.c b/mm/sparse.c index 08f053218ee8..dff71f173ae9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -8,6 +8,7 @@ #include #include #include +#include "internal.h" #include #include #include @@ -376,6 +377,9 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { return; /* XXX: Not implemented yet */ } +static void free_map_bootmem(struct page *page, unsigned long nr_pages) +{ +} #else static struct page *__kmalloc_section_memmap(unsigned long nr_pages) { @@ -413,17 +417,47 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) free_pages((unsigned long)memmap, get_order(sizeof(struct page) * nr_pages)); } + +static void free_map_bootmem(struct page *page, unsigned long nr_pages) +{ + unsigned long maps_section_nr, removing_section_nr, i; + int magic; + + for (i = 0; i < nr_pages; i++, page++) { + magic = atomic_read(&page->_mapcount); + + BUG_ON(magic == NODE_INFO); + + maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); + removing_section_nr = page->private; + + /* + * When this function is called, the removing section is + * logical offlined state. This means all pages are isolated + * from page allocator. If removing section's memmap is placed + * on the same section, it must not be freed. + * If it is freed, page allocator may allocate it which will + * be removed physically soon. + */ + if (maps_section_nr != removing_section_nr) + put_page_bootmem(page); + } +} #endif /* CONFIG_SPARSEMEM_VMEMMAP */ static void free_section_usemap(struct page *memmap, unsigned long *usemap) { + struct page *usemap_page; + unsigned long nr_pages; + if (!usemap) return; + usemap_page = virt_to_page(usemap); /* * Check to see if allocation came from hot-plug-add */ - if (PageSlab(virt_to_page(usemap))) { + if (PageSlab(usemap_page)) { kfree(usemap); if (memmap) __kfree_section_memmap(memmap, PAGES_PER_SECTION); @@ -431,10 +465,19 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) } /* - * TODO: Allocations came from bootmem - how do I free up ? + * The usemap came from bootmem. This is packed with other usemaps + * on the section which has pgdat at boot time. Just keep it as is now. 
*/ - printk(KERN_WARNING "Not freeing up allocations from bootmem " - "- leaking memory\n"); + + if (memmap) { + struct page *memmap_page; + memmap_page = virt_to_page(memmap); + + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + + free_map_bootmem(memmap_page, nr_pages); + } } /* -- cgit v1.2.3 From 97d87c9710bc6c5f2585fb9dc58f5bedbe996f10 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 28 Apr 2008 02:13:35 -0700 Subject: oom_kill: remove unused parameter in badness() In commit 4c4a22148909e4c003562ea7ffe0a06e26919e3c, we moved the memcontroller-related code from badness() to select_bad_process(), so the parameter 'mem' in badness() is unused now. Signed-off-by: Li Zefan Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e41504aa5da9..8a5467ee6265 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,8 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex); * of least surprise ... (be careful when you change it) */ -unsigned long badness(struct task_struct *p, unsigned long uptime, - struct mem_cgroup *mem) +unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; struct mm_struct *mm; @@ -256,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, if (p->oomkilladj == OOM_DISABLE) continue; - points = badness(p, uptime.tv_sec, mem); + points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; -- cgit v1.2.3 From 2309f9e6fe3f1de661eab9613f7903ab4420c753 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 28 Apr 2008 02:13:35 -0700 Subject: mm/page_alloc.c: remove hand-coded get_order() Remove hand-coded get_order() from page_alloc.c. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d3358efdf4e6..d1cf4f05dcda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4345,9 +4345,7 @@ void *__init alloc_large_system_hash(const char *tablename, else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { - unsigned long order; - for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) - ; + unsigned long order = get_order(size); table = (void*) __get_free_pages(GFP_ATOMIC, order); /* * If bucketsize is not a power-of-two, we may free -- cgit v1.2.3 From 468fd62ed9090ccbe872489df5d0d099510df4b5 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Mon, 28 Apr 2008 02:13:37 -0700 Subject: vmstats: add cond_resched() to refresh_cpu_vm_stats() We've found that it can take quite a bit of time (100's of usec) to get through the zone loop in refresh_cpu_vm_stats(). Adding a cond_resched() to allow other threads to run in the non-preemptive case. 
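The pattern is simply a voluntary preemption point per zone inside a long loop. A user-space analogue is sketched below, with sched_yield() standing in for cond_resched() and process_zone()/NR_ZONES made up for the example; it is not the kernel's refresh_cpu_vm_stats().

/*
 * User-space analogue of the change: a long per-zone loop with a yield
 * point after each iteration.  In the kernel, cond_resched() is cheap and
 * only actually reschedules when the scheduler has flagged that a
 * reschedule is due.
 */
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

#define NR_ZONES 1024	/* pretend there is a lot of per-zone work */

static void process_zone(int zone)
{
	(void)zone;
	usleep(1);	/* stand-in for folding this zone's per-cpu diffs */
}

int main(void)
{
	int zone;

	for (zone = 0; zone < NR_ZONES; zone++) {
		process_zone(zone);
		sched_yield();	/* kernel: cond_resched() between zones */
	}
	printf("processed %d zones with a reschedule point per zone\n",
	       NR_ZONES);
	return 0;
}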
Signed-off-by: Dimitri Sivanich Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 4c21670f8d91..ec6035eda933 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -322,6 +322,7 @@ void refresh_cpu_vm_stats(int cpu) p->expire = 3; #endif } + cond_resched(); #ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this -- cgit v1.2.3 From 4016a1390d07f15b267eecb20e76a48fd5c524ef Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Mon, 28 Apr 2008 02:13:38 -0700 Subject: mm/nommu.c: return 0 from kobjsize with invalid objects Don't perform kobjsize operations on objects the kernel doesn't manage. On Blackfin, drivers can get dma coherent memory by calling a function dma_alloc_coherent(). We do this in nommu by configuring a chunk of uncached memory at the top of memory. Since we don't want the kernel to use the uncached memory, we lie to the kernel, and tell it that it's max memory is between 0, and the start of the uncached dma coherent section. this all works well, until this memory gets exposed into userspace (with a frame buffer), when you look at the process's maps, it shows the framebuf: root:/proc> cat maps [snip] 03f0ef00-03f34700 rw-p 00000000 1f:00 192 /dev/fb0 root:/proc> This is outside the "normal" range for the kernel. When the kernel tries to find the size of this object (when you run ps), it dies in nommu.c in kobjsize. BUG_ON(page->index >= MAX_ORDER); since the page we are referring to is outside what the kernel thinks is it's max valid memory. root:~> while [ 1 ]; ps > /dev/null; done kernel BUG at mm/nommu.c:119! Kernel panic - not syncing: BUG! We fixed this by adding a check to reject out of range object pointers as it already does that for NULL pointers. Signed-off-by: Michael Hennerich Signed-off-by: Robin Getz Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 5d8ae086f74e..1d32fe89d57b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -105,7 +105,11 @@ unsigned int kobjsize(const void *objp) { struct page *page; - if (!objp || !((page = virt_to_page(objp)))) + /* + * If the object we have should not have ksize performed on it, + * return size of 0 + */ + if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp)))) return 0; if (PageSlab(page)) -- cgit v1.2.3 From 1e5ad9a3b9b78767a2eb1345201e46f41f9457ef Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 28 Apr 2008 20:40:08 +0300 Subject: mm/memory_hotplug.c must #include "internal.h" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes the following compile error caused by commit 04753278769f3b6c3b79a080edb52f21d83bf6e2 ("memory hotplug: register section/node id to free"): CC mm/memory_hotplug.o /home/bunk/linux/kernel-2.6/git/linux-2.6/mm/memory_hotplug.c: In function ‘put_page_bootmem’: /home/bunk/linux/kernel-2.6/git/linux-2.6/mm/memory_hotplug.c:82: error: implicit declaration of function ‘__free_pages_bootmem’ /home/bunk/linux/kernel-2.6/git/linux-2.6/mm/memory_hotplug.c: At top level: /home/bunk/linux/kernel-2.6/git/linux-2.6/mm/memory_hotplug.c:87: warning: no previous prototype for ‘register_page_bootmem_info_section’ make[2]: *** [mm/memory_hotplug.o] Error 1 [ Andrew: "Argh. 
The -mm-only memory-hotplug-add-removable-to-sysfs- to-show-memblock-removability.patch debugging patch adds that include so nobody hit this before. ] Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c4ba85c8cb00..b17dca7249f8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -29,6 +29,8 @@ #include +#include "internal.h" + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { -- cgit v1.2.3 From 7b8ee84d8926e6c6ec584548d23a12f1410b4db7 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 28 Apr 2008 14:13:19 -0700 Subject: mm: fix integer as NULL pointer warnings mm/hugetlb.c:207:11: warning: Using plain integer as NULL pointer Signed-off-by: Harvey Harrison Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index df28c1773fb2..2c37c67ed8c9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -204,7 +204,7 @@ static struct page *alloc_fresh_huge_page_node(int nid) if (page) { if (arch_prepare_hugepage(page)) { __free_pages(page, HUGETLB_PAGE_ORDER); - return 0; + return NULL; } set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); -- cgit v1.2.3
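For reference, the warning class fixed above can be reproduced with a few lines run through sparse. struct page is only forward-declared and the lookup functions are made up for the example; running sparse over the file flags the "return 0" line with "warning: Using plain integer as NULL pointer", while the NULL form is the equivalent, preferred spelling.

/*
 * Minimal reproduction of the sparse warning fixed above.  The functions
 * here are illustrative only.
 */
#include <stddef.h>

struct page;			/* opaque, pointer use only */

static struct page *lookup_bad(void)
{
	return 0;		/* sparse warns here */
}

static struct page *lookup_good(void)
{
	return NULL;		/* same value, explicit pointer intent */
}

int main(void)
{
	return (lookup_bad() == lookup_good()) ? 0 : 1;
}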