From 41d54d3bf83f62d3ff5948cb788fe6007e66a0d0 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux-foundation.org>
Date: Thu, 3 Jul 2008 09:14:26 -0500
Subject: slub: Do not use 192 byte sized cache if minimum alignment is 128
 byte

The 192 byte cache is not necessary if we have a basic alignment of 128
byte. If it would be used then the 192 would be aligned to the next 128 byte
boundary which would result in another 256 byte cache. Two 256 kmalloc caches
cause sysfs to complain about a duplicate entry.

MIPS needs 128 byte aligned kmalloc caches and spits out warnings on boot without
this patch.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slub.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 0987d1cd943c..2c9a62d1f429 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2995,8 +2995,6 @@ void __init kmem_cache_init(void)
 		create_kmalloc_cache(&kmalloc_caches[1],
 				"kmalloc-96", 96, GFP_KERNEL);
 		caches++;
-	}
-	if (KMALLOC_MIN_SIZE <= 128) {
 		create_kmalloc_cache(&kmalloc_caches[2],
 				"kmalloc-192", 192, GFP_KERNEL);
 		caches++;
@@ -3026,6 +3024,16 @@ void __init kmem_cache_init(void)
 	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
 		size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
 
+	if (KMALLOC_MIN_SIZE == 128) {
+		/*
+		 * The 192 byte sized cache is not used if the alignment
+		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
+		 * instead.
+		 */
+		for (i = 128 + 8; i <= 192; i += 8)
+			size_index[(i - 1) / 8] = 8;
+	}
+
 	slab_state = UP;
 
 	/* Provide the correct kmalloc names now that the caches are up */
-- 
cgit v1.2.3


From 494de90098784b8e2797598cefdd34188884ec2e Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Thu, 3 Jul 2008 05:27:51 +0100
Subject: Do not overwrite nr_zones on !NUMA when initialising zlcache_ptr

The non-NUMA case of build_zonelist_cache() would initialize the
zlcache_ptr for both node_zonelists[] to NULL.

Which is problematic, since non-NUMA only has a single node_zonelists[]
entry, and trying to zero the non-existent second one just overwrote the
nr_zones field instead.

As kswapd uses this value to determine what reclaim work is necessary,
the result is that kswapd never reclaims.  This causes processes to
stall frequently in low-memory situations as they always direct reclaim.
This patch initialises zlcache_ptr correctly.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Tested-by: Dan Williams <dan.j.williams@intel.com>
[ Simplified patch a bit ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2f552955a02f..f32fae3121f0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2328,7 +2328,6 @@ static void build_zonelists(pg_data_t *pgdat)
 static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	pgdat->node_zonelists[0].zlcache_ptr = NULL;
-	pgdat->node_zonelists[1].zlcache_ptr = NULL;
 }
 
 #endif	/* CONFIG_NUMA */
-- 
cgit v1.2.3


From cde53535991fbb5c34a1566f25955297c1487b8d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 4 Jul 2008 09:59:22 -0700
Subject: Christoph has moved

Remove all clameter@sgi.com addresses from the kernel tree since they will
become invalid on June 27th.  Change my maintainer email address for the
slab allocators to cl@linux-foundation.org (which will be the new email
address for the future).

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/allocpercpu.c    | 2 +-
 mm/migrate.c        | 2 +-
 mm/slub.c           | 2 +-
 mm/sparse-vmemmap.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index f4026bae6eed..05f2b4009ccc 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -1,7 +1,7 @@
 /*
  * linux/mm/allocpercpu.c
  *
- * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
+ * Separated from slab.c August 11, 2006 Christoph Lameter
  */
 #include <linux/mm.h>
 #include <linux/module.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 112bcaeaa104..55bd355d170d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -9,7 +9,7 @@
  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  * Hirokazu Takahashi <taka@valinux.co.jp>
  * Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter
  */
 
 #include <linux/migrate.h>
diff --git a/mm/slub.c b/mm/slub.c
index 2c9a62d1f429..1a427c0ae83b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5,7 +5,7 @@
  * The allocator synchronizes using per slab locks and only
  * uses a centralized lock to manage a pool of partial slabs.
  *
- * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
+ * (C) 2007 SGI, Christoph Lameter
  */
 
 #include <linux/mm.h>
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 99c4f36eb8a3..a91b5f8fcaf6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -1,7 +1,7 @@
 /*
  * Virtual Memory Map support
  *
- * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
+ * (C) 2007 sgi. Christoph Lameter.
  *
  * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
  * virt_to_page, page_address() to be implemented as a base offset
-- 
cgit v1.2.3


From 251b97f552b1ad414cc5a9ccc8e4e94503edd5fc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 4 Jul 2008 09:59:24 -0700
Subject: mm: dirty page accounting vs VM_MIXEDMAP

Dirty page accounting accurately measures the amound of dirty pages in
writable shared mappings by mapping the pages RO (as indicated by
vma_wants_writenotify).  We then trap on first write and call
set_page_dirty() on the page, after which we map the page RW and
continue execution.

When we launder dirty pages, we call clear_page_dirty_for_io() which
clears both the dirty flag, and maps the page RO again before we start
writeout so that the story can repeat itself.

vma_wants_writenotify() excludes VM_PFNMAP on the basis that we cannot
do the regular dirty page stuff on raw PFNs and the memory isn't going
anywhere anyway.

The recently introduced VM_MIXEDMAP mixes both !pfn_valid() and
pfn_valid() pages in a single mapping.

We can't do dirty page accounting on !pfn_valid() pages as stated
above, and mapping them RO causes them to be COW'ed on write, which
breaks VM_SHARED semantics.

Excluding VM_MIXEDMAP in vma_wants_writenotify() would mean we don't do
the regular dirty page accounting for the pfn_valid() pages, which
would bring back all the head-aches from inaccurate dirty page
accounting.

So instead, we let the !pfn_valid() pages get mapped RO, but fix them
up unconditionally in the fault path.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: "Jared Hulbert" <jaredeh@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index d14b251a25a6..350e646032f5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1697,8 +1697,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
-	if (!old_page)
+	if (!old_page) {
+		/*
+		 * VM_MIXEDMAP !pfn_valid() case
+		 *
+		 * We should not cow pages in a shared writeable mapping.
+		 * Just mark the pages writable as we can't do any dirty
+		 * accounting on raw pfn maps.
+		 */
+		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+				     (VM_WRITE|VM_SHARED))
+			goto reuse;
 		goto gotten;
+	}
 
 	/*
 	 * Take out anonymous pages first, anonymous shared vmas are
@@ -1751,6 +1762,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	if (reuse) {
+reuse:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-- 
cgit v1.2.3


From 7a36a752d006f6874049da510297eeb7f09d92a7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 4 Jul 2008 09:59:28 -0700
Subject: get_user_pages(): fix possible page leak on oom

get_user_pages() must not return the error when i != 0.  When pages !=
NULL we have i get_page()'ed pages.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 350e646032f5..2302d228fe04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1151,7 +1151,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			 * be processed until returning to user space.
 			 */
 			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-				return -ENOMEM;
+				return i ? i : -ENOMEM;
 
 			if (write)
 				foll_flags |= FOLL_WRITE;
-- 
cgit v1.2.3


From d79df630f622806c4d0e116fbaf6ebf6baf53461 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 4 Jul 2008 12:24:13 -0700
Subject: mempolicy: mask off internal flags for userspace API

Flags considered internal to the mempolicy kernel code are stored as part
of the "flags" member of struct mempolicy.

Before exposing a policy type to userspace via get_mempolicy(), these
internal flags must be masked.  Flags exposed to userspace, however,
should still be returned to the user.

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a37a5034f63d..c94e58b192c3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -729,7 +729,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	} else {
 		*policy = pol == &default_policy ? MPOL_DEFAULT :
 						pol->mode;
-		*policy |= pol->flags;
+		/*
+		 * Internal mempolicy flags must be masked off before exposing
+		 * the policy to userspace.
+		 */
+		*policy |= (pol->flags & MPOL_MODE_FLAGS);
 	}
 
 	if (vma) {
-- 
cgit v1.2.3


From bdb21928512a860a60e6a24a849dc5b63cbaf96a Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Thu, 10 Jul 2008 22:21:58 +0200
Subject: slub: Fix use-after-preempt of per-CPU data structure

Vegard Nossum reported a crash in kmem_cache_alloc():

	BUG: unable to handle kernel paging request at da87d000
	IP: [<c01991c7>] kmem_cache_alloc+0xc7/0xe0
	*pde = 28180163 *pte = 1a87d160
	Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
	Pid: 3850, comm: grep Not tainted (2.6.26-rc9-00059-gb190333 #5)
	EIP: 0060:[<c01991c7>] EFLAGS: 00210203 CPU: 0
	EIP is at kmem_cache_alloc+0xc7/0xe0
	EAX: 00000000 EBX: da87c100 ECX: 1adad71a EDX: 6b6b6b6b
	ESI: 00200282 EDI: da87d000 EBP: f60bfe74 ESP: f60bfe54
	DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068

and analyzed it:

  "The register %ecx looks innocent but is very important here. The disassembly:

       mov    %edx,%ecx
       shr    $0x2,%ecx
       rep stos %eax,%es:(%edi) <-- the fault

   So %ecx has been loaded from %edx... which is 0x6b6b6b6b/POISON_FREE.
   (0x6b6b6b6b >> 2 == 0x1adadada.)

   %ecx is the counter for the memset, from here:

       memset(object, 0, c->objsize);

  i.e. %ecx was loaded from c->objsize, so "c" must have been freed.
  Where did "c" come from? Uh-oh...

       c = get_cpu_slab(s, smp_processor_id());

  This looks like it has very much to do with CPU hotplug/unplug. Is
  there a race between SLUB/hotplug since the CPU slab is used after it
  has been freed?"

Good analysis.

Yeah, it's possible that a caller of kmem_cache_alloc() -> slab_alloc()
can be migrated on another CPU right after local_irq_restore() and
before memset().  The inital cpu can become offline in the mean time (or
a migration is a consequence of the CPU going offline) so its
'kmem_cache_cpu' structure gets freed ( slab_cpuup_callback).

At some point of time the caller continues on another CPU having an
obsolete pointer...

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Reported-by: Vegard Nossum <vegard.nossum@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 1a427c0ae83b..315c392253c7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1628,9 +1628,11 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 	void **object;
 	struct kmem_cache_cpu *c;
 	unsigned long flags;
+	unsigned int objsize;
 
 	local_irq_save(flags);
 	c = get_cpu_slab(s, smp_processor_id());
+	objsize = c->objsize;
 	if (unlikely(!c->freelist || !node_match(c, node)))
 
 		object = __slab_alloc(s, gfpflags, node, addr, c);
@@ -1643,7 +1645,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 	local_irq_restore(flags);
 
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
-		memset(object, 0, c->objsize);
+		memset(object, 0, objsize);
 
 	return object;
 }
-- 
cgit v1.2.3