summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorXunlei Pang <xlpang@redhat.com>2017-03-13 10:50:19 +0100
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2018-03-22 09:17:38 +0100
commitfdfc3fa74511d687dbf626248b671cbe861c7c04 (patch)
tree29a8afdc151c6c31da9e1568e03e958a9829e7b7 /arch
parent56eac6842374eea81ce30147124cc005d491d722 (diff)
x86/mce: Handle broadcasted MCE gracefully with kexec
[ Upstream commit 5bc329503e8191c91c4c40836f062ef771d8ba83 ] When we are about to kexec a crash kernel and right then and there a broadcasted MCE fires while we're still in the first kernel and while the other CPUs remain in a holding pattern, the #MC handler of the first kernel will timeout and then panic due to never completing MCE synchronization. Handle this in a similar way as to when the CPUs are offlined when that broadcasted MCE happens. [ Boris: rewrote commit message and comments. ] Suggested-by: Borislav Petkov <bp@alien8.de> Signed-off-by: Xunlei Pang <xlpang@redhat.com> Signed-off-by: Borislav Petkov <bp@suse.de> Acked-by: Tony Luck <tony.luck@intel.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: kexec@lists.infradead.org Cc: linux-edac <linux-edac@vger.kernel.org> Link: http://lkml.kernel.org/r/1487857012-9059-1-git-send-email-xlpang@redhat.com Link: http://lkml.kernel.org/r/20170313095019.19351-1-bp@alien8.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/include/asm/reboot.h1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c18
-rw-r--r--arch/x86/kernel/reboot.c5
3 files changed, 20 insertions, 4 deletions
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 2cb1cc253d51..fc62ba8dce93 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -15,6 +15,7 @@ struct machine_ops {
};
extern struct machine_ops machine_ops;
+extern int crashing_cpu;
void native_machine_crash_shutdown(struct pt_regs *regs);
void native_machine_shutdown(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 684d9fd191e0..88a996d80ca6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -48,6 +48,7 @@
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/reboot.h>
#include "mce-internal.h"
@@ -1081,9 +1082,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* on Intel.
*/
int lmce = 1;
+ int cpu = smp_processor_id();
- /* If this CPU is offline, just bail out. */
- if (cpu_is_offline(smp_processor_id())) {
+ /*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+ if (cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index ce020a69bba9..03f21dbfaa9d 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -769,10 +769,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
#endif
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
+
#if defined(CONFIG_SMP)
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi;