From cba6d0d64ee53772b285d0c0c288deefbeaf7775 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 2 Jul 2012 07:08:42 -0700
Subject: Revert "rcu: Move PREEMPT_RCU preemption to switch_to() invocation"

This reverts commit 616c310e83b872024271c915c1b9ab505b9efad9.
(Move PREEMPT_RCU preemption to switch_to() invocation).
Testing by Sasha Levin <levinsasha928@gmail.com> showed that this
can result in deadlock due to invoking the scheduler when one of
the runqueue locks is held.  Because this commit was simply a
performance optimization, revert it.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
---
 include/linux/rcupdate.h |  1 -
 include/linux/rcutiny.h  |  6 ++++++
 include/linux/sched.h    | 10 ----------
 3 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 26d1a47591f1..9cac722b169c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -184,7 +184,6 @@ static inline int rcu_preempt_depth(void)
 /* Internal to kernel */
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
-extern void rcu_preempt_note_context_switch(void);
 extern void rcu_check_callbacks(int cpu, int user);
 struct notifier_block;
 extern void rcu_idle_enter(void);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 854dc4c5c271..4e56a9c69a35 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -87,6 +87,10 @@ static inline void kfree_call_rcu(struct rcu_head *head,
 
 #ifdef CONFIG_TINY_RCU
 
+static inline void rcu_preempt_note_context_switch(void)
+{
+}
+
 static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 {
 	*delta_jiffies = ULONG_MAX;
@@ -95,6 +99,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 
 #else /* #ifdef CONFIG_TINY_RCU */
 
+void rcu_preempt_note_context_switch(void);
 int rcu_preempt_needs_cpu(void);
 
 static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
@@ -108,6 +113,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 static inline void rcu_note_context_switch(int cpu)
 {
 	rcu_sched_qs(cpu);
+	rcu_preempt_note_context_switch();
 }
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4059c0f33f07..06a4c5f4f55c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1871,22 +1871,12 @@ static inline void rcu_copy_process(struct task_struct *p)
 	INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
-static inline void rcu_switch_from(struct task_struct *prev)
-{
-	if (prev->rcu_read_lock_nesting != 0)
-		rcu_preempt_note_context_switch();
-}
-
 #else
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
 }
 
-static inline void rcu_switch_from(struct task_struct *prev)
-{
-}
-
 #endif
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 5a081caa0414b9bbb82c17ffab9d6fe66edbb72f Mon Sep 17 00:00:00 2001
From: Ohad Ben-Cohen <ohad@wizery.com>
Date: Wed, 6 Jun 2012 10:09:25 +0300
Subject: rpmsg: avoid premature deallocation of endpoints

When an inbound message arrives, the rpmsg core looks up its
associated endpoint and invokes the registered callback.

If a message arrives while its endpoint is being removed (because
the rpmsg driver was removed, or a recovery of a remote processor
has kicked in) we must ensure atomicity, i.e.:

- Either the ept is removed before it is found

or

- The ept is found but will not be freed until the callback returns

This is achieved by maintaining a per-ept reference count, which,
when drops to zero, will trigger deallocation of the ept.

With this in hand, it is now forbidden to directly deallocate
epts once they have been added to the endpoints idr.

Cc: stable <stable@vger.kernel.org>
Reported-by: Fernando Guzman Lugo <fernando.lugo@ti.com>
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 include/linux/rpmsg.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index a8e50e44203c..195f373590b8 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -38,6 +38,7 @@
 #include <linux/types.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
+#include <linux/kref.h>
 
 /* The feature bitmap for virtio rpmsg */
 #define VIRTIO_RPMSG_F_NS	0 /* RP supports name service notifications */
@@ -120,6 +121,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32);
 /**
  * struct rpmsg_endpoint - binds a local rpmsg address to its user
  * @rpdev: rpmsg channel device
+ * @refcount: when this drops to zero, the ept is deallocated
  * @cb: rx callback handler
  * @addr: local rpmsg address
  * @priv: private data for the driver's use
@@ -140,6 +142,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32);
  */
 struct rpmsg_endpoint {
 	struct rpmsg_channel *rpdev;
+	struct kref refcount;
 	rpmsg_rx_cb_t cb;
 	u32 addr;
 	void *priv;
-- 
cgit v1.2.3


From 15fd943af50dbc5f7f4de33835795c72595f7bf4 Mon Sep 17 00:00:00 2001
From: Ohad Ben-Cohen <ohad@wizery.com>
Date: Thu, 7 Jun 2012 15:39:35 +0300
Subject: rpmsg: make sure inflight messages don't invoke just-removed
 callbacks

When inbound messages arrive, rpmsg core looks up their associated
endpoint (by destination address) and then invokes their callback.

We've made sure that endpoints will never be de-allocated after they
were found by rpmsg core, but we also need to protect against the
(rare) scenario where the rpmsg driver was just removed, and its
callback function isn't available anymore.

This is achieved by introducing a callback mutex, which must be taken
before the callback is invoked, and, obviously, before it is removed.

Cc: stable <stable@vger.kernel.org>
Reported-by: Fernando Guzman Lugo <fernando.lugo@ti.com>
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 include/linux/rpmsg.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index 195f373590b8..82a673905edb 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -39,6 +39,7 @@
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/kref.h>
+#include <linux/mutex.h>
 
 /* The feature bitmap for virtio rpmsg */
 #define VIRTIO_RPMSG_F_NS	0 /* RP supports name service notifications */
@@ -123,6 +124,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32);
  * @rpdev: rpmsg channel device
  * @refcount: when this drops to zero, the ept is deallocated
  * @cb: rx callback handler
+ * @cb_lock: must be taken before accessing/changing @cb
  * @addr: local rpmsg address
  * @priv: private data for the driver's use
  *
@@ -144,6 +146,7 @@ struct rpmsg_endpoint {
 	struct rpmsg_channel *rpdev;
 	struct kref refcount;
 	rpmsg_rx_cb_t cb;
+	struct mutex cb_lock;
 	u32 addr;
 	void *priv;
 };
-- 
cgit v1.2.3


From f567fde24640cf6f2d6416196bfc8b3fefc8e433 Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Wed, 20 Jun 2012 14:14:05 +0530
Subject: gpio: fix bits conflict for gpio flags

The bit 2 and 3 in GPIO flag are allocated for the
flag OPEN_DRAIN/OPEN_SOURCE. These bits are reused
for the flag EXPORT/EXPORT_CHANGEABLE and so creating
conflict.
Fix this conflict by assigning bit 4 and 5 for the
flag EXPORT/EXPORT_CHANGEABLE.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index f07fc2d08159..2e31e8b3a190 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -22,8 +22,8 @@
 /* Gpio pin is open source */
 #define GPIOF_OPEN_SOURCE	(1 << 3)
 
-#define GPIOF_EXPORT		(1 << 2)
-#define GPIOF_EXPORT_CHANGEABLE	(1 << 3)
+#define GPIOF_EXPORT		(1 << 4)
+#define GPIOF_EXPORT_CHANGEABLE	(1 << 5)
 #define GPIOF_EXPORT_DIR_FIXED	(GPIOF_EXPORT)
 #define GPIOF_EXPORT_DIR_CHANGEABLE (GPIOF_EXPORT | GPIOF_EXPORT_CHANGEABLE)
 
-- 
cgit v1.2.3


From 5167e8d5417bf5c322a703d2927daec727ea40dd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 22 Jun 2012 15:52:09 +0200
Subject: sched/nohz: Rewrite and fix load-avg computation -- again

Thanks to Charles Wang for spotting the defects in the current code:

 - If we go idle during the sample window -- after sampling, we get a
   negative bias because we can negate our own sample.

 - If we wake up during the sample window we get a positive bias
   because we push the sample to a known active period.

So rewrite the entire nohz load-avg muck once again, now adding
copious documentation to the code.

Reported-and-tested-by: Doug Smythies <dsmythies@telus.net>
Reported-and-tested-by: Charles Wang <muming.wq@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: stable@kernel.org
Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins
[ minor edits ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4059c0f33f07..20cb7497c59c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1909,6 +1909,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 }
 #endif
 
+#ifdef CONFIG_NO_HZ
+void calc_load_enter_idle(void);
+void calc_load_exit_idle(void);
+#else
+static inline void calc_load_enter_idle(void) { }
+static inline void calc_load_exit_idle(void) { }
+#endif /* CONFIG_NO_HZ */
+
 #ifndef CONFIG_CPUMASK_OFFSTACK
 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 {
-- 
cgit v1.2.3


From 222a806af830fda34ad1f6bc991cd226916de060 Mon Sep 17 00:00:00 2001
From: Mark Rustad <mark.d.rustad@intel.com>
Date: Thu, 21 Jun 2012 12:23:42 -0700
Subject: [SCSI] Fix NULL dereferences in scsi_cmd_to_driver

Avoid crashing if the private_data pointer happens to be NULL. This has
been seen sometimes when a host reset happens, notably when there are
many LUNs:

host3: Assigned Port ID 0c1601
scsi host3: libfc: Host reset succeeded on port (0c1601)
BUG: unable to handle kernel NULL pointer dereference at 0000000000000350
IP: [<ffffffff81352bb8>] scsi_send_eh_cmnd+0x58/0x3a0
<snip>
Process scsi_eh_3 (pid: 4144, threadinfo ffff88030920c000, task ffff880326b160c0)
Stack:
 000000010372e6ba 0000000000000282 000027100920dca0 ffffffffa0038ee0
 0000000000000000 0000000000030003 ffff88030920dc80 ffff88030920dc80
 00000002000e0000 0000000a00004000 ffff8803242f7760 ffff88031326ed80
Call Trace:
 [<ffffffff8105b590>] ? lock_timer_base+0x70/0x70
 [<ffffffff81352fbe>] scsi_eh_tur+0x3e/0xc0
 [<ffffffff81353a36>] scsi_eh_test_devices+0x76/0x170
 [<ffffffff81354125>] scsi_eh_host_reset+0x85/0x160
 [<ffffffff81354291>] scsi_eh_ready_devs+0x91/0x110
 [<ffffffff813543fd>] scsi_unjam_host+0xed/0x1f0
 [<ffffffff813546a8>] scsi_error_handler+0x1a8/0x200
 [<ffffffff81354500>] ? scsi_unjam_host+0x1f0/0x1f0
 [<ffffffff8106ec3e>] kthread+0x9e/0xb0
 [<ffffffff81509264>] kernel_thread_helper+0x4/0x10
 [<ffffffff8106eba0>] ? kthread_freezable_should_stop+0x70/0x70
 [<ffffffff81509260>] ? gs_change+0x13/0x13
Code: 25 28 00 00 00 48 89 45 c8 31 c0 48 8b 87 80 00 00 00 48 8d b5 60 ff ff ff 89 d1 48 89 fb 41 89 d6 4c 89 fa 48 8b 80 b8 00 00 00
 <48> 8b 80 50 03 00 00 48 8b 00 48 89 85 38 ff ff ff 48 8b 07 4c
RIP  [<ffffffff81352bb8>] scsi_send_eh_cmnd+0x58/0x3a0
 RSP <ffff88030920dc50>
CR2: 0000000000000350


Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
Tested-by: Marcus Dennis <marcusx.e.dennis@intel.com>
Cc: <stable@kernel.org>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 include/scsi/scsi_cmnd.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 1e1198546c72..ac06cc595890 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -134,10 +134,16 @@ struct scsi_cmnd {
 
 static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd)
 {
+	struct scsi_driver **sdp;
+
 	if (!cmd->request->rq_disk)
 		return NULL;
 
-	return *(struct scsi_driver **)cmd->request->rq_disk->private_data;
+	sdp = (struct scsi_driver **)cmd->request->rq_disk->private_data;
+	if (!sdp)
+		return NULL;
+
+	return *sdp;
 }
 
 extern struct scsi_cmnd *scsi_get_command(struct scsi_device *, gfp_t);
-- 
cgit v1.2.3


From 6ef1b512f4e6f936d89aa20be3d97a7ec7c290ac Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 22 Jun 2012 10:52:34 -0700
Subject: [SCSI] libsas: fix taskfile corruption in sas_ata_qc_fill_rtf

fill_result_tf() grabs the taskfile flags from the originating qc which
sas_ata_qc_fill_rtf() promptly overwrites.  The presence of an
ata_taskfile in the sata_device makes it tempting to just copy the full
contents in sas_ata_qc_fill_rtf().  However, libata really only wants
the fis contents and expects the other portions of the taskfile to not
be touched by ->qc_fill_rtf.  To that end store a fis buffer in the
sata_device and use ata_tf_from_fis() like every other ->qc_fill_rtf()
implementation.

Cc: <stable@vger.kernel.org>
Reported-by: Praveen Murali <pmurali@logicube.com>
Tested-by: Praveen Murali <pmurali@logicube.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 include/scsi/libsas.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index f4f1c96dca72..10ce74f589c5 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -163,6 +163,8 @@ enum ata_command_set {
         ATAPI_COMMAND_SET = 1,
 };
 
+#define ATA_RESP_FIS_SIZE 24
+
 struct sata_device {
         enum   ata_command_set command_set;
         struct smp_resp        rps_resp; /* report_phy_sata_resp */
@@ -171,7 +173,7 @@ struct sata_device {
 
 	struct ata_port *ap;
 	struct ata_host ata_host;
-	struct ata_taskfile tf;
+	u8     fis[ATA_RESP_FIS_SIZE];
 };
 
 enum {
@@ -537,7 +539,7 @@ enum exec_status {
  */
 struct ata_task_resp {
 	u16  frame_len;
-	u8   ending_fis[24];	  /* dev to host or data-in */
+	u8   ending_fis[ATA_RESP_FIS_SIZE];	  /* dev to host or data-in */
 };
 
 #define SAS_STATUS_BUF_SIZE 96
-- 
cgit v1.2.3


From dbf0e4c7257f8d684ec1a3c919853464293de66e Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 9 Jul 2012 11:09:21 -0400
Subject: PCI: EHCI: fix crash during suspend on ASUS computers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quite a few ASUS computers experience a nasty problem, related to the
EHCI controllers, when going into system suspend.  It was observed
that the problem didn't occur if the controllers were not put into the
D3 power state before starting the suspend, and commit
151b61284776be2d6f02d48c23c3625678960b97 (USB: EHCI: fix crash during
suspend on ASUS computers) was created to do this.

It turned out this approach messed up other computers that didn't have
the problem -- it prevented USB wakeup from working.  Consequently
commit c2fb8a3fa25513de8fedb38509b1f15a5bbee47b (USB: add
NO_D3_DURING_SLEEP flag and revert 151b61284776be2) was merged; it
reverted the earlier commit and added a whitelist of known good board
names.

Now we know the actual cause of the problem.  Thanks to AceLan Kao for
tracking it down.

According to him, an engineer at ASUS explained that some of their
BIOSes contain a bug that was added in an attempt to work around a
problem in early versions of Windows.  When the computer goes into S3
suspend, the BIOS tries to verify that the EHCI controllers were first
quiesced by the OS.  Nothing's wrong with this, but the BIOS does it
by checking that the PCI COMMAND registers contain 0 without checking
the controllers' power state.  If the register isn't 0, the BIOS
assumes the controller needs to be quiesced and tries to do so.  This
involves making various MMIO accesses to the controller, which don't
work very well if the controller is already in D3.  The end result is
a system hang or memory corruption.

Since the value in the PCI COMMAND register doesn't matter once the
controller has been suspended, and since the value will be restored
anyway when the controller is resumed, we can work around the BIOS bug
simply by setting the register to 0 during system suspend.  This patch
(as1590) does so and also reverts the second commit mentioned above,
which is now unnecessary.

In theory we could do this for every PCI device.  However to avoid
introducing new problems, the patch restricts itself to EHCI host
controllers.

Finally the affected systems can suspend with USB wakeup working
properly.

Reference: https://bugzilla.kernel.org/show_bug.cgi?id=37632
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=42728
Based-on-patch-by: AceLan Kao <acelan.kao@canonical.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Tested-by: Dâniel Fraga <fragabr@gmail.com>
Tested-by: Javier Marcet <jmarcet@gmail.com>
Tested-by: Andrey Rahmatullin <wrar@wrar.name>
Tested-by: Oleksij Rempel <bug-track@fisher-privat.net>
Tested-by: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Cc: stable <stable@vger.kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/pci.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index fefb4e19bf6a..d8c379dba6ad 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -176,8 +176,6 @@ enum pci_dev_flags {
 	PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2,
 	/* Provide indication device is assigned by a Virtual Machine Manager */
 	PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4,
-	/* Device causes system crash if in D3 during S3 sleep */
-	PCI_DEV_FLAGS_NO_D3_DURING_SLEEP = (__force pci_dev_flags_t) 8,
 };
 
 enum pci_irq_reroute_variant {
-- 
cgit v1.2.3


From f55a6faa384304c89cfef162768e88374d3312cb Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 10 Jul 2012 18:43:19 -0400
Subject: hrtimer: Provide clock_was_set_delayed()

clock_was_set() cannot be called from hard interrupt context because
it calls on_each_cpu().

For fixing the widely reported leap seconds issue it is necessary to
call it from hard interrupt context, i.e. the timer tick code, which
does the timekeeping updates.

Provide a new function which denotes it in the hrtimer cpu base
structure of the cpu on which it is called and raise the hrtimer
softirq. We then execute the clock_was_set() notificiation from
softirq context in run_hrtimer_softirq(). The hrtimer softirq is
rarely used, so polling the flag there is not a performance issue.

[ tglx: Made it depend on CONFIG_HIGH_RES_TIMERS. We really should get
  rid of all this ifdeffery ASAP ]

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Reported-by: Jan Engelhardt <jengelh@inai.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1341960205-56738-2-git-send-email-johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fd0dc30c9f15..c9ec9400ee5b 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -165,6 +165,7 @@ enum  hrtimer_base_type {
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
  * @active_bases:	Bitfield to mark bases with active timers
+ * @clock_was_set:	Indicates that clock was set from irq context.
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
  * @hres_active:	State of high resolution mode
@@ -177,7 +178,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
-	unsigned long			active_bases;
+	unsigned int			active_bases;
+	unsigned int			clock_was_set;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t				expires_next;
 	int				hres_active;
@@ -286,6 +288,8 @@ extern void hrtimer_peek_ahead_timers(void);
 # define MONOTONIC_RES_NSEC	HIGH_RES_NSEC
 # define KTIME_MONOTONIC_RES	KTIME_HIGH_RES
 
+extern void clock_was_set_delayed(void);
+
 #else
 
 # define MONOTONIC_RES_NSEC	LOW_RES_NSEC
@@ -306,6 +310,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
 	return 0;
 }
+
+static inline void clock_was_set_delayed(void) { }
+
 #endif
 
 extern void clock_was_set(void);
-- 
cgit v1.2.3


From f6c06abfb3972ad4914cef57d8348fcb2932bc3b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 10 Jul 2012 18:43:24 -0400
Subject: timekeeping: Provide hrtimer update function

To finally fix the infamous leap second issue and other race windows
caused by functions which change the offsets between the various time
bases (CLOCK_MONOTONIC, CLOCK_REALTIME and CLOCK_BOOTTIME) we need a
function which atomically gets the current monotonic time and updates
the offsets of CLOCK_REALTIME and CLOCK_BOOTTIME with minimalistic
overhead. The previous patch which provides ktime_t offsets allows us
to make this function almost as cheap as ktime_get() which is going to
be replaced in hrtimer_interrupt().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/1341960205-56738-7-git-send-email-johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index c9ec9400ee5b..cc07d2777bbe 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -327,6 +327,7 @@ extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_real(void);
 extern ktime_t ktime_get_boottime(void);
 extern ktime_t ktime_get_monotonic_offset(void);
+extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot);
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 
-- 
cgit v1.2.3


From d8adde17e5f858427504725218c56aef90e90fc7 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@huawei.com>
Date: Wed, 11 Jul 2012 14:01:52 -0700
Subject: memory hotplug: fix invalid memory access caused by stale kswapd
 pointer

kswapd_stop() is called to destroy the kswapd work thread when all memory
of a NUMA node has been offlined.  But kswapd_stop() only terminates the
work thread without resetting NODE_DATA(nid)->kswapd to NULL.  The stale
pointer will prevent kswapd_run() from creating a new work thread when
adding memory to the memory-less NUMA node again.  Eventually the stale
pointer may cause invalid memory access.

An example stack dump as below. It's reproduced with 2.6.32, but latest
kernel has the same issue.

  BUG: unable to handle kernel NULL pointer dereference at (null)
  IP: [<ffffffff81051a94>] exit_creds+0x12/0x78
  PGD 0
  Oops: 0000 [#1] SMP
  last sysfs file: /sys/devices/system/memory/memory391/state
  CPU 11
  Modules linked in: cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq microcode fuse loop dm_mod tpm_tis rtc_cmos i2c_i801 rtc_core tpm serio_raw pcspkr sg tpm_bios igb i2c_core iTCO_wdt rtc_lib mptctl iTCO_vendor_support button dca bnx2 usbhid hid uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan ide_pci_generic ide_core ata_generic ata_piix libata thermal processor thermal_sys hwmon mptsas mptscsih mptbase scsi_transport_sas scsi_mod
  Pid: 7949, comm: sh Not tainted 2.6.32.12-qiuxishi-5-default #92 Tecal RH2285
  RIP: 0010:exit_creds+0x12/0x78
  RSP: 0018:ffff8806044f1d78  EFLAGS: 00010202
  RAX: 0000000000000000 RBX: ffff880604f22140 RCX: 0000000000019502
  RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000
  RBP: ffff880604f22150 R08: 0000000000000000 R09: ffffffff81a4dc10
  R10: 00000000000032a0 R11: ffff880006202500 R12: 0000000000000000
  R13: 0000000000c40000 R14: 0000000000008000 R15: 0000000000000001
  FS:  00007fbc03d066f0(0000) GS:ffff8800282e0000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 0000000000000000 CR3: 000000060f029000 CR4: 00000000000006e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process sh (pid: 7949, threadinfo ffff8806044f0000, task ffff880603d7c600)
  Stack:
   ffff880604f22140 ffffffff8103aac5 ffff880604f22140 ffffffff8104d21e
   ffff880006202500 0000000000008000 0000000000c38000 ffffffff810bd5b1
   0000000000000000 ffff880603d7c600 00000000ffffdd29 0000000000000003
  Call Trace:
    __put_task_struct+0x5d/0x97
    kthread_stop+0x50/0x58
    offline_pages+0x324/0x3da
    memory_block_change_state+0x179/0x1db
    store_mem_state+0x9e/0xbb
    sysfs_write_file+0xd0/0x107
    vfs_write+0xad/0x169
    sys_write+0x45/0x6e
    system_call_fastpath+0x16/0x1b
  Code: ff 4d 00 0f 94 c0 84 c0 74 08 48 89 ef e8 1f fd ff ff 5b 5d 31 c0 41 5c c3 53 48 8b 87 20 06 00 00 48 89 fb 48 8b bf 18 06 00 00 <8b> 00 48 c7 83 18 06 00 00 00 00 00 00 f0 ff 0f 0f 94 c0 84 c0
  RIP  exit_creds+0x12/0x78
   RSP <ffff8806044f1d78>
  CR2: 0000000000000000

[akpm@linux-foundation.org: add pglist_data.kswapd locking comments]
Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2427706f78b4..68c569fcbb66 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -694,7 +694,7 @@ typedef struct pglist_data {
 					     range, including holes */
 	int node_id;
 	wait_queue_head_t kswapd_wait;
-	struct task_struct *kswapd;
+	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
 } pg_data_t;
-- 
cgit v1.2.3


From 99ab7b19440a72ebdf225f99b20f8ef40decee86 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 11 Jul 2012 14:02:53 -0700
Subject: mm: sparse: fix usemap allocation above node descriptor section

After commit f5bf18fa22f8 ("bootmem/sparsemem: remove limit constraint
in alloc_bootmem_section"), usemap allocations may easily be placed
outside the optimal section that holds the node descriptor, even if
there is space available in that section.  This results in unnecessary
hotplug dependencies that need to have the node unplugged before the
section holding the usemap.

The reason is that the bootmem allocator doesn't guarantee a linear
search starting from the passed allocation goal but may start out at a
much higher address absent an upper limit.

Fix this by trying the allocation with the limit at the section end,
then retry without if that fails.  This keeps the fix from f5bf18fa22f8
of not panicking if the allocation does not fit in the section, but
still makes sure to try to stay within the section at first.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org>	[3.3.x, 3.4.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 324fe08ea3b1..6d6795d46a75 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -91,6 +91,11 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal);
+void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+				  unsigned long size,
+				  unsigned long align,
+				  unsigned long goal,
+				  unsigned long limit);
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
 				 unsigned long goal);
-- 
cgit v1.2.3


From 29f6738609e40227dabcc63bfb3b84b3726a75bd Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 11 Jul 2012 14:02:56 -0700
Subject: memblock: free allocated memblock_reserved_regions later

memblock_free_reserved_regions() calls memblock_free(), but
memblock_free() would double reserved.regions too, so we could free the
old range for reserved.regions.

Also tj said there is another bug which could be related to this.

| I don't think we're saving any noticeable
| amount by doing this "free - give it to page allocator - reserve
| again" dancing.  We should just allocate regions aligned to page
| boundaries and free them later when memblock is no longer in use.

in that case, when DEBUG_PAGEALLOC, will get panic:

     memblock_free: [0x0000102febc080-0x0000102febf080] memblock_free_reserved_regions+0x37/0x39
  BUG: unable to handle kernel paging request at ffff88102febd948
  IP: [<ffffffff836a5774>] __next_free_mem_range+0x9b/0x155
  PGD 4826063 PUD cf67a067 PMD cf7fa067 PTE 800000102febd160
  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  CPU 0
  Pid: 0, comm: swapper Not tainted 3.5.0-rc2-next-20120614-sasha #447
  RIP: 0010:[<ffffffff836a5774>]  [<ffffffff836a5774>] __next_free_mem_range+0x9b/0x155

See the discussion at https://lkml.org/lkml/2012/6/13/469

So try to allocate with PAGE_SIZE alignment and free it later.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index a6bb10235148..19dc455b4f3d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -50,9 +50,7 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end,
 				phys_addr_t size, phys_addr_t align, int nid);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
-int memblock_free_reserved_regions(void);
-int memblock_reserve_reserved_regions(void);
-
+phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
-- 
cgit v1.2.3