diff options
author | Tony Luck <tony.luck@intel.com> | 2016-02-17 10:20:13 -0800 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2016-02-18 09:22:42 +0100 |
commit | b2f9d678e28ca71ce650eac82f26dd287b47e89a (patch) | |
tree | 78559fcff433de13a8412e4e726abd8ff5cc0548 /arch/x86/kernel/cpu/mcheck/mce.c | |
parent | 548acf19234dbda5a52d5a8e7e205af46e9da840 (diff) |
x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries
Extend the severity checking code to add a new context IN_KERN_RECOV
which is used to indicate that the machine check was triggered by code
in the kernel tagged with _ASM_EXTABLE_FAULT() so that the ex_handler_fault()
handler will provide the fixup code with the trap number.
Major re-work to the tail code in do_machine_check() to make all this
readable/maintainable. One functional change is that tolerant=3 no longer
stops recovery actions. Revert to only skipping sending SIGBUS to the
current process.
Signed-off-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/89d243d05a7943bb187d1074bb30d9c4f482d5f5.1455732970.git.tony.luck@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 70 |
1 files changed, 36 insertions, 34 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b7180801ea33..524f2a8492d7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear) } } +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + if (ret) + pr_err("Memory error not recovered"); + return ret; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - u64 recover_paddr = ~0ull; - int flags = MF_ACTION_REQUIRED; int lmce = 0; /* If this CPU is offline, just bail out. */ @@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) } /* - * At insane "tolerant" levels we take no action. Otherwise - * we only die if we have no other choice. For less serious - * issues we try to recover, or limit damage to the current - * process. + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. */ - if (cfg->tolerant < 3) { - if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - if (worst == MCE_AR_SEVERITY) { - recover_paddr = m.addr; - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - } else if (kill_it) { - force_sig(SIGBUS, current); - } - } + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) mce_report_event(regs); @@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code) out: sync_core(); - if (recover_paddr == ~0ull) - goto done; + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out_ist; - pr_err("Uncorrected hardware memory error in user-access at %llx", - recover_paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. - */ - ist_begin_non_atomic(regs); - local_irq_enable(); - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + ist_begin_non_atomic(regs); + local_irq_enable(); + + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); + } else { + if (!fixup_exception(regs, X86_TRAP_MC)) + mce_panic("Failed kernel mode recovery", &m, NULL); } - local_irq_disable(); - ist_end_non_atomic(); -done: + +out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); |