From 0a2409aad38e97b1db55e6515b990be7b17060f6 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 11 Jun 2014 13:57:27 -0700 Subject: trace, AER: Move trace into unified interface AER uses a separate trace interface by now. To make it consistent, move it into unified RAS trace interface. Signed-off-by: Chen, Gong Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- include/ras/ras_event.h | 64 ++++++++++++++++++++++++++++++++++++++ include/trace/events/ras.h | 77 ---------------------------------------------- 2 files changed, 64 insertions(+), 77 deletions(-) delete mode 100644 include/trace/events/ras.h (limited to 'include') diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 21cdb0b7b0fb..acbcbb88eaaa 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -8,6 +8,7 @@ #include #include #include +#include /* * Hardware Events Report @@ -94,6 +95,69 @@ TRACE_EVENT(mc_event, __get_str(driver_detail)) ); +/* + * PCIe AER Trace event + * + * These events are generated when hardware detects a corrected or + * uncorrected event on a PCIe device. The event report has + * the following structure: + * + * char * dev_name - The name of the slot where the device resides + * ([domain:]bus:device.function). + * u32 status - Either the correctable or uncorrectable register + * indicating what error or errors have been seen + * u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED + */ + +#define aer_correctable_errors \ + {BIT(0), "Receiver Error"}, \ + {BIT(6), "Bad TLP"}, \ + {BIT(7), "Bad DLLP"}, \ + {BIT(8), "RELAY_NUM Rollover"}, \ + {BIT(12), "Replay Timer Timeout"}, \ + {BIT(13), "Advisory Non-Fatal"} + +#define aer_uncorrectable_errors \ + {BIT(4), "Data Link Protocol"}, \ + {BIT(12), "Poisoned TLP"}, \ + {BIT(13), "Flow Control Protocol"}, \ + {BIT(14), "Completion Timeout"}, \ + {BIT(15), "Completer Abort"}, \ + {BIT(16), "Unexpected Completion"}, \ + {BIT(17), "Receiver Overflow"}, \ + {BIT(18), "Malformed TLP"}, \ + {BIT(19), "ECRC"}, \ + {BIT(20), "Unsupported Request"} + +TRACE_EVENT(aer_event, + TP_PROTO(const char *dev_name, + const u32 status, + const u8 severity), + + TP_ARGS(dev_name, status, severity), + + TP_STRUCT__entry( + __string( dev_name, dev_name ) + __field( u32, status ) + __field( u8, severity ) + ), + + TP_fast_assign( + __assign_str(dev_name, dev_name); + __entry->status = status; + __entry->severity = severity; + ), + + TP_printk("%s PCIe Bus Error: severity=%s, %s\n", + __get_str(dev_name), + __entry->severity == AER_CORRECTABLE ? "Corrected" : + __entry->severity == AER_FATAL ? + "Fatal" : "Uncorrected, non-fatal", + __entry->severity == AER_CORRECTABLE ? + __print_flags(__entry->status, "|", aer_correctable_errors) : + __print_flags(__entry->status, "|", aer_uncorrectable_errors)) +); + #endif /* _TRACE_HW_EVENT_MC_H */ /* This part must be outside protection */ diff --git a/include/trace/events/ras.h b/include/trace/events/ras.h deleted file mode 100644 index 1c875ad1ee5f..000000000000 --- a/include/trace/events/ras.h +++ /dev/null @@ -1,77 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM ras - -#if !defined(_TRACE_AER_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_AER_H - -#include -#include - - -/* - * PCIe AER Trace event - * - * These events are generated when hardware detects a corrected or - * uncorrected event on a PCIe device. The event report has - * the following structure: - * - * char * dev_name - The name of the slot where the device resides - * ([domain:]bus:device.function). - * u32 status - Either the correctable or uncorrectable register - * indicating what error or errors have been seen - * u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED - */ - -#define aer_correctable_errors \ - {BIT(0), "Receiver Error"}, \ - {BIT(6), "Bad TLP"}, \ - {BIT(7), "Bad DLLP"}, \ - {BIT(8), "RELAY_NUM Rollover"}, \ - {BIT(12), "Replay Timer Timeout"}, \ - {BIT(13), "Advisory Non-Fatal"} - -#define aer_uncorrectable_errors \ - {BIT(4), "Data Link Protocol"}, \ - {BIT(12), "Poisoned TLP"}, \ - {BIT(13), "Flow Control Protocol"}, \ - {BIT(14), "Completion Timeout"}, \ - {BIT(15), "Completer Abort"}, \ - {BIT(16), "Unexpected Completion"}, \ - {BIT(17), "Receiver Overflow"}, \ - {BIT(18), "Malformed TLP"}, \ - {BIT(19), "ECRC"}, \ - {BIT(20), "Unsupported Request"} - -TRACE_EVENT(aer_event, - TP_PROTO(const char *dev_name, - const u32 status, - const u8 severity), - - TP_ARGS(dev_name, status, severity), - - TP_STRUCT__entry( - __string( dev_name, dev_name ) - __field( u32, status ) - __field( u8, severity ) - ), - - TP_fast_assign( - __assign_str(dev_name, dev_name); - __entry->status = status; - __entry->severity = severity; - ), - - TP_printk("%s PCIe Bus Error: severity=%s, %s\n", - __get_str(dev_name), - __entry->severity == AER_CORRECTABLE ? "Corrected" : - __entry->severity == AER_FATAL ? - "Fatal" : "Uncorrected, non-fatal", - __entry->severity == AER_CORRECTABLE ? - __print_flags(__entry->status, "|", aer_correctable_errors) : - __print_flags(__entry->status, "|", aer_uncorrectable_errors)) -); - -#endif /* _TRACE_AER_H */ - -/* This part must be outside protection */ -#include -- cgit v1.2.3 From 3760cd20402d4c131e1994c968ecb055fa0f74bc Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 11 Jun 2014 13:59:45 -0700 Subject: CPER: Adjust code flow of some functions Some codes can be reorganzied as a common function for other usages. Signed-off-by: Chen, Gong Signed-off-by: Tony Luck --- include/linux/cper.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/cper.h b/include/linux/cper.h index 2fc0ec3d89cc..ed088b9c1298 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -35,6 +35,13 @@ */ #define CPER_RECORD_REV 0x0100 +/* + * CPER record length contains the CPER fields which are relevant for further + * handling of a memory error in userspace (we don't carry all the fields + * defined in the UEFI spec because some of them don't make any sense.) + * Currently, a length of 256 should be more than enough. + */ +#define CPER_REC_LEN 256 /* * Severity difinition for error_severity in struct cper_record_header * and section_severity in struct cper_section_descriptor @@ -395,6 +402,8 @@ struct cper_sec_pcie { #pragma pack() u64 cper_next_record_id(void); +const char *cper_severity_str(unsigned int); +const char *cper_mem_err_type_str(unsigned int); void cper_print_bits(const char *prefix, unsigned int bits, const char * const strs[], unsigned int strs_size); -- cgit v1.2.3 From d963cd95bea93b7db9390a71d1e2cabbb3b2c3ea Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 11 Jun 2014 14:02:20 -0700 Subject: RAS, debugfs: Add debugfs interface for RAS subsystem Implement a new debugfs interface for RAS susbsystem. A file named daemon_active is added there accordingly. This file is used to track if user space daemon accesses perf/trace interface or not. One can track which daemon opens it via "lsof /path/to/debugfs/ras/daemon_active". Signed-off-by: Chen, Gong Link: http://lkml.kernel.org/r/1402475691-30045-5-git-send-email-gong.chen@linux.intel.com Signed-off-by: Borislav Petkov Signed-off-by: Tony Luck --- include/linux/ras.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 include/linux/ras.h (limited to 'include') diff --git a/include/linux/ras.h b/include/linux/ras.h new file mode 100644 index 000000000000..2aceeafd6fe5 --- /dev/null +++ b/include/linux/ras.h @@ -0,0 +1,14 @@ +#ifndef __RAS_H__ +#define __RAS_H__ + +#ifdef CONFIG_DEBUG_FS +int ras_userspace_consumers(void); +void ras_debugfs_init(void); +int ras_add_daemon_trace(void); +#else +static inline int ras_userspace_consumers(void) { return 0; } +static inline void ras_debugfs_init(void) { return; } +static inline int ras_add_daemon_trace(void) { return 0; } +#endif + +#endif -- cgit v1.2.3 From 2dfb7d51a61d7ca91b131c8db612f27d9390f2d5 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Tue, 17 Jun 2014 22:33:07 -0400 Subject: trace, RAS: Add eMCA trace event interface Add trace interface to elaborate all H/W error related information. Signed-off-by: Chen, Gong Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- include/linux/cper.h | 23 ++++++++++++++++++ include/ras/ras_event.h | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) (limited to 'include') diff --git a/include/linux/cper.h b/include/linux/cper.h index ed088b9c1298..76abba4b238e 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -22,6 +22,7 @@ #define LINUX_CPER_H #include +#include /* CPER record signature and the size */ #define CPER_SIG_RECORD "CPER" @@ -363,6 +364,24 @@ struct cper_sec_mem_err { __u16 mem_dev_handle; /* module handle in UEFI 2.4 */ }; +struct cper_mem_err_compact { + __u64 validation_bits; + __u16 node; + __u16 card; + __u16 module; + __u16 bank; + __u16 device; + __u16 row; + __u16 column; + __u16 bit_pos; + __u64 requestor_id; + __u64 responder_id; + __u64 target_id; + __u16 rank; + __u16 mem_array_handle; + __u16 mem_dev_handle; +}; + struct cper_sec_pcie { __u64 validation_bits; __u32 port_type; @@ -406,5 +425,9 @@ const char *cper_severity_str(unsigned int); const char *cper_mem_err_type_str(unsigned int); void cper_print_bits(const char *prefix, unsigned int bits, const char * const strs[], unsigned int strs_size); +void cper_mem_err_pack(const struct cper_sec_mem_err *, + struct cper_mem_err_compact *); +const char *cper_mem_err_unpack(struct trace_seq *, + struct cper_mem_err_compact *); #endif diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index acbcbb88eaaa..47da53c27ffa 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -9,6 +9,70 @@ #include #include #include +#include + +/* + * MCE Extended Error Log trace event + * + * These events are generated when hardware detects a corrected or + * uncorrected event. + */ + +/* memory trace event */ + +#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) +TRACE_EVENT(extlog_mem_event, + TP_PROTO(struct cper_sec_mem_err *mem, + u32 err_seq, + const uuid_le *fru_id, + const char *fru_text, + u8 sev), + + TP_ARGS(mem, err_seq, fru_id, fru_text, sev), + + TP_STRUCT__entry( + __field(u32, err_seq) + __field(u8, etype) + __field(u8, sev) + __field(u64, pa) + __field(u8, pa_mask_lsb) + __field_struct(uuid_le, fru_id) + __string(fru_text, fru_text) + __field_struct(struct cper_mem_err_compact, data) + ), + + TP_fast_assign( + __entry->err_seq = err_seq; + if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) + __entry->etype = mem->error_type; + else + __entry->etype = ~0; + __entry->sev = sev; + if (mem->validation_bits & CPER_MEM_VALID_PA) + __entry->pa = mem->physical_addr; + else + __entry->pa = ~0ull; + + if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) + __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask); + else + __entry->pa_mask_lsb = ~0; + __entry->fru_id = *fru_id; + __assign_str(fru_text, fru_text); + cper_mem_err_pack(mem, &__entry->data); + ), + + TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s", + __entry->err_seq, + cper_severity_str(__entry->sev), + cper_mem_err_type_str(__entry->etype), + __entry->pa, + __entry->pa_mask_lsb, + cper_mem_err_unpack(p, &__entry->data), + &__entry->fru_id, + __get_str(fru_text)) +); +#endif /* * Hardware Events Report -- cgit v1.2.3 From 9dae3d0d9e64c3cb8bb172f041d4e66d4b92088a Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:11 +0200 Subject: apei, mce: Factor out APEI architecture specific MCE calls. This commit abstracts MCE calls and provides weak corresponding default implementation for those architectures which do not need arch specific actions. Each platform willing to do additional architectural actions should provides desired function definition. It allows us to avoid wrap code into #ifdef in generic code and prevent new platform from introducing dummy stub function too. Initially, there are two APEI arch-specific calls: - arch_apei_enable_cmcff() - arch_apei_report_mem_error() Both interact with MCE driver for X86 architecture. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- include/acpi/apei.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/acpi/apei.h b/include/acpi/apei.h index 04f349d8da73..8a23c95109c6 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -42,5 +42,8 @@ ssize_t erst_read(u64 record_id, struct cper_record_header *record, size_t buflen); int erst_clear(u64 record_id); +int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data); +void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err); + #endif #endif -- cgit v1.2.3 From 44a69f6195628f6f940566d133a72987559e102d Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:12 +0200 Subject: acpi, apei, ghes: Make NMI error notification to be GHES architecture extension. Currently APEI depends on x86 architecture. It is because of NMI hardware error notification of GHES which is currently supported by x86 only. However, many other APEI features can be still used perfectly by other architectures. This commit adds two symbols: 1. HAVE_ACPI_APEI for those archs which support APEI. 2. HAVE_ACPI_APEI_NMI which is used for NMI code isolation in ghes.c file. NMI related data and functions are grouped so they can be wrapped inside one #ifdef section. Appropriate function stubs are provided for !NMI case. Note there is no functional changes for x86 due to hard selected HAVE_ACPI_APEI and HAVE_ACPI_APEI_NMI symbols. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- include/linux/nmi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 447775ee2c4b..1d2a6ab6b8bb 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -63,4 +63,8 @@ extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif +#ifdef CONFIG_HAVE_ACPI_APEI_NMI +#include +#endif + #endif -- cgit v1.2.3 From 594c7255dce7a13cac50cf2470cc56e2c3b0494e Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:13 +0200 Subject: acpi, apei, ghes: Factor out ioremap virtual memory for IRQ and NMI context. GHES currently maps two pages with atomic_ioremap. From now on, NMI is architectural depended so there is no need to allocate an NMI page for platforms without NMI support. To make it possible to not use a second page, swap the existing page order so that the IRQ context page is first, and the optional NMI context page is second. Then, use HAVE_ACPI_APEI_NMI to decide how many pages are to be allocated. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- include/acpi/apei.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/apei.h b/include/acpi/apei.h index 8a23c95109c6..76284bb560a6 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -44,6 +44,7 @@ int erst_clear(u64 record_id); int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data); void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err); +void arch_apei_flush_tlb_one(unsigned long addr); #endif #endif -- cgit v1.2.3 From 5ccb8225abf2ac51cd023a99f28366ac9823bd0d Mon Sep 17 00:00:00 2001 From: Mike Qiu Date: Tue, 29 Jul 2014 10:49:25 -0700 Subject: x86/ras: Fix build warnings in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build warning due to a missing forward declaration in . We need struct pci_dev to be forward declared so we can define pointers to it, but we don't need to pull in the whole definition. build log: In file included from include/ras/ras_event.h:11:0, from drivers/ras/ras.c:13: include/linux/aer.h:42:129: warning: ‘struct pci_dev’ declared inside parameter list [enabled by default] include/linux/aer.h:42:129: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default] include/linux/aer.h:46:130: warning: ‘struct pci_dev’ declared inside parameter list [enabled by default] include/linux/aer.h:50:136: warning: ‘struct pci_dev’ declared inside parameter list [enabled by default] include/linux/aer.h:57:14: warning: ‘struct pci_dev’ declared inside parameter list [enabled by default] Signed-off-by: Mike Qiu Link: http://lkml.kernel.org/r/53d7dea511471321bb@agluck-desk.sc.intel.com Acked-by: Randy Dunlap Tested-by: Randy Dunlap Acked-by: Bjorn Helgaas Signed-off-by: Tony Luck Signed-off-by: H. Peter Anvin --- include/linux/aer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/aer.h b/include/linux/aer.h index 4dbaa7081530..c826d1c28f9c 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -11,6 +11,8 @@ #define AER_FATAL 1 #define AER_CORRECTABLE 2 +struct pci_dev; + struct aer_header_log_regs { unsigned int dw0; unsigned int dw1; -- cgit v1.2.3