From 42a886af728c089df8da1b0017b0e7e6c81b5335 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 17 Jun 2008 17:47:50 -0700
Subject: x86-64: Fix "bytes left to copy" return value for copy_from_user()

Most users by far do not care about the exact return value (they only
really care about whether the copy succeeded in its entirety or not),
but a few special core routines actually care deeply about exactly how
many bytes were copied from user space.

And the unrolled versions of the x86-64 user copy routines would
sometimes report that it had copied more bytes than it actually had.

Very few uses actually have partial copies to begin with, but to make
this bug even harder to trigger, most x86 CPU's use the "rep string"
instructions for normal user copies, and that version didn't have this
issue.

To make it even harder to hit, the one user of this that really cared
about the return value (and used the uncached version of the copy that
doesn't use the "rep string" instructions) was the generic write
routine, which pre-populated its source, once more hiding the problem by
avoiding the exception case that triggers the bug.

In other words, very special thanks to Bron Gondwana who not only
triggered this, but created a test-program to show it, and bisected the
behavior down to commit 08291429cfa6258c4cd95d8833beb40f828b194e ("mm:
fix pagecache write deadlocks") which changed the access pattern just
enough that you can now trigger it with 'writev()' with multiple
iovec's.

That commit itself was not the cause of the bug, it just allowed all the
stars to align just right that you could trigger the problem.

[ Side note: this is just the minimal fix to make the copy routines
  (with __copy_from_user_inatomic_nocache as the particular version that
  was involved in showing this) have the right return values.

  We really should improve on the exceptional case further - to make the
  copy do a byte-accurate copy up to the exact page limit that causes it
  to fail.  As it is, the callers have to do extra work to handle the
  limit case gracefully. ]

Reported-by: Bron Gondwana <brong@fastmail.fm>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

 (which didn't have this problem), and since
most users that do the carethis was very hard to trigger, but
---
 arch/x86/lib/copy_user_64.S         | 25 +++++++++++--------------
 arch/x86/lib/copy_user_nocache_64.S | 25 +++++++++++--------------
 2 files changed, 22 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 70bebd310408..ee1c3f635157 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -217,19 +217,19 @@ ENTRY(copy_user_generic_unrolled)
 	/* table sorted by exception address */
 	.section __ex_table,"a"
 	.align 8
-	.quad .Ls1,.Ls1e
-	.quad .Ls2,.Ls2e
-	.quad .Ls3,.Ls3e
-	.quad .Ls4,.Ls4e
-	.quad .Ld1,.Ls1e
+	.quad .Ls1,.Ls1e	/* Ls1-Ls4 have copied zero bytes */
+	.quad .Ls2,.Ls1e
+	.quad .Ls3,.Ls1e
+	.quad .Ls4,.Ls1e
+	.quad .Ld1,.Ls1e	/* Ld1-Ld4 have copied 0-24 bytes */
 	.quad .Ld2,.Ls2e
 	.quad .Ld3,.Ls3e
 	.quad .Ld4,.Ls4e
-	.quad .Ls5,.Ls5e
-	.quad .Ls6,.Ls6e
-	.quad .Ls7,.Ls7e
-	.quad .Ls8,.Ls8e
-	.quad .Ld5,.Ls5e
+	.quad .Ls5,.Ls5e	/* Ls5-Ls8 have copied 32 bytes */
+	.quad .Ls6,.Ls5e
+	.quad .Ls7,.Ls5e
+	.quad .Ls8,.Ls5e
+	.quad .Ld5,.Ls5e	/* Ld5-Ld8 have copied 32-56 bytes */
 	.quad .Ld6,.Ls6e
 	.quad .Ld7,.Ls7e
 	.quad .Ld8,.Ls8e
@@ -244,11 +244,8 @@ ENTRY(copy_user_generic_unrolled)
 	.quad .Le5,.Le_zero
 	.previous
 
-	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
-	   pessimistic side. this is gross. it would be better to fix the
-	   interface. */
 	/* eax: zero, ebx: 64 */
-.Ls1e: 	addl $8,%eax
+.Ls1e: 	addl $8,%eax		/* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
 .Ls2e: 	addl $8,%eax
 .Ls3e: 	addl $8,%eax
 .Ls4e: 	addl $8,%eax
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 5196762b3b0e..9d3d1ab83763 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -145,19 +145,19 @@ ENTRY(__copy_user_nocache)
 	/* table sorted by exception address */
 	.section __ex_table,"a"
 	.align 8
-	.quad .Ls1,.Ls1e
-	.quad .Ls2,.Ls2e
-	.quad .Ls3,.Ls3e
-	.quad .Ls4,.Ls4e
-	.quad .Ld1,.Ls1e
+	.quad .Ls1,.Ls1e	/* .Ls[1-4] - 0 bytes copied */
+	.quad .Ls2,.Ls1e
+	.quad .Ls3,.Ls1e
+	.quad .Ls4,.Ls1e
+	.quad .Ld1,.Ls1e	/* .Ld[1-4] - 0..24 bytes coped */
 	.quad .Ld2,.Ls2e
 	.quad .Ld3,.Ls3e
 	.quad .Ld4,.Ls4e
-	.quad .Ls5,.Ls5e
-	.quad .Ls6,.Ls6e
-	.quad .Ls7,.Ls7e
-	.quad .Ls8,.Ls8e
-	.quad .Ld5,.Ls5e
+	.quad .Ls5,.Ls5e	/* .Ls[5-8] - 32 bytes copied */
+	.quad .Ls6,.Ls5e
+	.quad .Ls7,.Ls5e
+	.quad .Ls8,.Ls5e
+	.quad .Ld5,.Ls5e	/* .Ld[5-8] - 32..56 bytes copied */
 	.quad .Ld6,.Ls6e
 	.quad .Ld7,.Ls7e
 	.quad .Ld8,.Ls8e
@@ -172,11 +172,8 @@ ENTRY(__copy_user_nocache)
 	.quad .Le5,.Le_zero
 	.previous
 
-	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
-	   pessimistic side. this is gross. it would be better to fix the
-	   interface. */
 	/* eax: zero, ebx: 64 */
-.Ls1e: 	addl $8,%eax
+.Ls1e: 	addl $8,%eax	/* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
 .Ls2e: 	addl $8,%eax
 .Ls3e: 	addl $8,%eax
 .Ls4e: 	addl $8,%eax
-- 
cgit v1.2.3


From 75118a82e21cafb4a82b53bb85d1c7689787e046 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 13 Jun 2008 15:47:12 -0700
Subject: x86: fix NULL pointer deref in __switch_to

Patrick McHardy reported a crash:

> > I get this oops once a day, its apparently triggered by something
> > run by cron, but the process is a different one each time.
> >
> > Kernel is -git from yesterday shortly before the -rc6 release
> > (last commit is the usb-2.6 merge, the x86 patches are missing),
> > .config is attached.
> >
> > I'll retry with current -git, but the patches that have gone in
> > since I last updated don't look related.
> >
> > [62060.043009] BUG: unable to handle kernel NULL pointer dereference at
> > 000001ff
> > [62060.043009] IP: [<c0102a9b>] __switch_to+0x2f/0x118
> > [62060.043009] *pde = 00000000
> > [62060.043009] Oops: 0002 [#1] PREEMPT

Vegard Nossum analyzed it:

> This decodes to
>
>    0:   0f ae 00                fxsave (%eax)
>
> so it's related to the floating-point context. This is the exact
> location of the crash:
>
> $ addr2line -e arch/x86/kernel/process_32.o -i ab0
> include/asm/i387.h:232
> include/asm/i387.h:262
> arch/x86/kernel/process_32.c:595
>
> ...so it looks like prev_task->thread.xstate->fxsave has become NULL.
> Or maybe it never had any other value.

Somehow (as described below) TS_USEDFPU is set but the fpu is not
allocated or freed.

Another possible FPU pre-emption issue with the sleazy FPU optimization
which was benign before but not so anymore, with the dynamic FPU allocation
patch.

New task is getting exec'd and it is prempted at the below point.

flush_thread() {
	...
	/*
	* Forget coprocessor state..
	*/
	clear_fpu(tsk);
		<----- Preemption point
	clear_used_math();
	...
}

Now when it context switches in again, as the used_math() is still set
and fpu_counter can be > 5, we will do a math_state_restore() which sets
the task's TS_USEDFPU. After it continues from the above preemption point
it does clear_used_math() and much later free_thread_xstate().

Now, at the next context switch, it is quite possible that xstate is
null, used_math() is not set and TS_USEDFPU is still set. This will
trigger unlazy_fpu() causing kernel oops.

Fix this  by clearing tsk's fpu_counter before clearing task's fpu.

Reported-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process_32.c | 1 +
 arch/x86/kernel/process_64.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6d5483356e74..e2db9ac5c61c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -333,6 +333,7 @@ void flush_thread(void)
 	/*
 	 * Forget coprocessor state..
 	 */
+	tsk->fpu_counter = 0;
 	clear_fpu(tsk);
 	clear_used_math();
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ac54ff56df80..c6eb5c91e5f6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -294,6 +294,7 @@ void flush_thread(void)
 	/*
 	 * Forget coprocessor state..
 	 */
+	tsk->fpu_counter = 0;
 	clear_fpu(tsk);
 	clear_used_math();
 }
-- 
cgit v1.2.3


From df17b1d990fc214f033c5588e58216ec941591e0 Mon Sep 17 00:00:00 2001
From: Mikael Pettersson <mikpe@it.uu.se>
Date: Sun, 15 Jun 2008 02:19:56 +0200
Subject: x86, 32-bit: fix boot failure on TSC-less processors

Booting 2.6.26-rc6 on my 486 DX/4 fails with a "BUG: Int 6"
(invalid opcode) and a kernel halt immediately after the
kernel has been uncompressed. The BUG shows EIP pointing
to an rdtsc instruction in native_read_tsc(), invoked from
native_sched_clock().

(This error occurs so early that not even the serial console
can capture it.)

A bisection showed that this bug first occurs in 2.6.26-rc3-git7,
via commit 9ccc906c97e34fd91dc6aaf5b69b52d824386910:

>x86: distangle user disabled TSC from unstable
>
>tsc_enabled is set to 0 from the command line switch "notsc" and from
>the mark_tsc_unstable code. Seperate those functionalities and replace
>tsc_enable with tsc_disable. This makes also the native_sched_clock()
>decision when to use TSC understandable.
>
>Preparatory patch to solve the sched_clock() issue on 32 bit.
>
>Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

The core reason for this bug is that native_sched_clock() gets
called before tsc_init().

Before the commit above, tsc_32.c used a "tsc_enabled" variable
which defaulted to 0 == disabled, and which only got enabled late
in tsc_init(). Thus early calls to native_sched_clock() would skip
the TSC and use jiffies instead.

After the commit above, tsc_32.c uses a "tsc_disabled" variable
which defaults to 0, meaning that the TSC is Ok to use. Early calls
to native_sched_clock() now erroneously try to use the TSC on
!cpu_has_tsc processors, leading to invalid opcode exceptions.

My proposed fix is to initialise tsc_disabled to a "soft disabled"
state distinct from the hard disabled state set up by the "notsc"
kernel option. This fixes the native_sched_clock() problem. It also
allows tsc_init() to be simplified: instead of setting tsc_disabled = 1
on every error return, we just set tsc_disabled = 0 once when all
checks have succeeded.

I've verified that this lets my 486 boot again. I've also verified
that a Core2 machine still uses the TSC as clocksource after the patch.

Signed-off-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc_32.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 068759db63dd..65b70637ad97 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -14,7 +14,10 @@
 
 #include "mach_timer.h"
 
-static int tsc_disabled;
+/* native_sched_clock() is called before tsc_init(), so
+   we must start with the TSC soft disabled to prevent
+   erroneous rdtsc usage on !cpu_has_tsc processors */
+static int tsc_disabled = -1;
 
 /*
  * On some systems the TSC frequency does not
@@ -402,25 +405,20 @@ void __init tsc_init(void)
 {
 	int cpu;
 
-	if (!cpu_has_tsc || tsc_disabled) {
-		/* Disable the TSC in case of !cpu_has_tsc */
-		tsc_disabled = 1;
+	if (!cpu_has_tsc || tsc_disabled > 0)
 		return;
-	}
 
 	cpu_khz = calculate_cpu_khz();
 	tsc_khz = cpu_khz;
 
 	if (!cpu_khz) {
 		mark_tsc_unstable("could not calculate TSC khz");
-		/*
-		 * We need to disable the TSC completely in this case
-		 * to prevent sched_clock() from using it.
-		 */
-		tsc_disabled = 1;
 		return;
 	}
 
+	/* now allow native_sched_clock() to use rdtsc */
+	tsc_disabled = 0;
+
 	printk("Detected %lu.%03lu MHz processor.\n",
 				(unsigned long)cpu_khz / 1000,
 				(unsigned long)cpu_khz % 1000);
-- 
cgit v1.2.3


From d3942cff620bea073fc4e3c8ed878eb1e84615ce Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Sun, 8 Jun 2008 16:16:07 +0200
Subject: x86: use BOOTMEM_EXCLUSIVE on 32-bit

This patch uses the BOOTMEM_EXCLUSIVE for crashkernel reservation also for
i386 and prints a error message on failure.

The patch is still for 2.6.26 since it is only bug fixing. The unification
of reserve_crashkernel() between i386 and x86_64 should be done for 2.6.27.

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/setup_32.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 2c5f8b213e86..5a2f8e063887 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -532,10 +532,16 @@ static void __init reserve_crashkernel(void)
 					(unsigned long)(crash_size >> 20),
 					(unsigned long)(crash_base >> 20),
 					(unsigned long)(total_mem >> 20));
+
+			if (reserve_bootmem(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE) < 0) {
+				printk(KERN_INFO "crashkernel reservation "
+					"failed - memory is in use\n");
+				return;
+			}
+
 			crashk_res.start = crash_base;
 			crashk_res.end   = crash_base + crash_size - 1;
-			reserve_bootmem(crash_base, crash_size,
-					BOOTMEM_DEFAULT);
 		} else
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"you have to specify a base address\n");
-- 
cgit v1.2.3


From ffe6e1da86d21d7855495b5a772c93f050258f6e Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jordan.crouse@amd.com>
Date: Wed, 18 Jun 2008 11:34:38 -0600
Subject: x86, geode: add a VSA2 ID for General Software

General Software writes their own VSA2 module for their version
of the Geode BIOS, which returns a different ID then the standard
VSA2.  This was causing the framebuffer driver to break for most
GSW boards.

Signed-off-by: Jordan Crouse <jordan.crouse@amd.com>
Cc: tglx@linutronix.de
Cc: linux-geode@lists.infradead.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/geode_32.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index e8edd63ab000..9b08e852fd1a 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -166,6 +166,8 @@ int geode_has_vsa2(void)
 	static int has_vsa2 = -1;
 
 	if (has_vsa2 == -1) {
+		u16 val;
+
 		/*
 		 * The VSA has virtual registers that we can query for a
 		 * signature.
@@ -173,7 +175,8 @@ int geode_has_vsa2(void)
 		outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
 		outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
 
-		has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG);
+		val = inw(VSA_VRC_DATA);
+		has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
 	}
 
 	return has_vsa2;
-- 
cgit v1.2.3