From 7b16bbf97375d9fb7fc107b3f80afeb94a204e44 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 18 Oct 2012 14:33:23 +0800 Subject: Revert "x86/mm: Fix the size calculation of mapping tables" Commit: 722bc6b16771 x86/mm: Fix the size calculation of mapping tables Tried to address the issue that the first 2/4M should use 4k pages if PSE enabled, but extra counts should only be valid for x86_32. This commit caused a kdump regression: the kdump kernel hangs. Work is in progress to fundamentally fix the various page table initialization issues that we have, via the design suggested by H. Peter Anvin, but it's not ready yet to be merged. So, to get a working kdump revert to the last known working version, which is the revert of this commit and of a followup fix (which was incomplete): bd2753b2dda7 x86/mm: Only add extra pages count for the first memory range during pre-allocation Tested kdump on physical and virtual machines. Signed-off-by: Dave Young Acked-by: Yinghai Lu Acked-by: Cong Wang Acked-by: Flavio Leitner Tested-by: Flavio Leitner Cc: Dan Carpenter Cc: Cong Wang Cc: Flavio Leitner Cc: Tejun Heo Cc: ianfang.cn@gmail.com Cc: Vivek Goyal Cc: Linus Torvalds Cc: Andrew Morton Cc: Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ab1f6a93b527..8653b3a722be 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,14 +29,8 @@ int direct_gbpages #endif ; -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - -static void __init find_early_table_space(struct map_range *mr, unsigned long end, - int use_pse, int use_gbpages) +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) { unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; phys_addr_t base; @@ -61,10 +55,6 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en #ifdef CONFIG_X86_32 extra += PMD_SIZE; #endif - /* The first 2/4M doesn't use large pages. */ - if (mr->start < PMD_SIZE) - extra += mr->end - mr->start; - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -95,6 +85,12 @@ void __init native_pagetable_reserve(u64 start, u64 end) memblock_reserve(start, end - start); } +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -267,7 +263,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(&mr[0], end, use_pse, use_gbpages); + find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, -- cgit v1.2.3 From 7991c9ca40d3127dd2ffa3a9c1e33f7d4005495a Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:30:01 -0400 Subject: perf/x86: Fix P6 FP_ASSIST event constraint According to Intel SDM Volume 3B, FP_ASSIST is limited to Counter 1 only, not Counter 0. Tested on a Pentium II. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191728570.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index e4dd0f7a0453..0ff5f7fb64cd 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -34,7 +34,7 @@ static struct event_constraint p6_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ -- cgit v1.2.3 From e09df47885d767e418902067ce1885aafa3b27db Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:31:54 -0400 Subject: perf/x86: Update/fix generic events on P6 PMU This patch updates the generic events on p6, including some new extended cache events. Values for these events were taken from the equivelant PAPI predefined events. Tested on a Pentium II. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191730080.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 111 +++++++++++++++++++++++++++++++++--- 1 file changed, 104 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 0ff5f7fb64cd..9582fcbcd8ec 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -8,13 +8,106 @@ */ static const u64 p6_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static __initconst u64 p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p6_pmu_event_map(int hw_event) @@ -158,5 +251,9 @@ __init int p6_pmu_init(void) x86_pmu = p6_pmu; + memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; } -- cgit v1.2.3 From 58e9eaf06f5476cb2192ec1d012674ce5e79dd21 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:33:38 -0400 Subject: perf/x86: Remove P6 cpuc->enabled check Between 2.6.33 and 2.6.34 the PMU code was made modular. The x86_pmu_enable() call was extended to disable cpuc->enabled and iterate the counters, enabling one at a time, before calling enable_all() at the end, followed by re-enabling cpuc->enabled. Since cpuc->enabled was set to 0, that change effectively caused the "val |= ARCH_PERFMON_EVENTSEL_ENABLE;" code in p6_pmu_enable_event() and p6_pmu_disable_event() to be dead code that was never called. This change removes this code (which was confusing) and adds some extra commentary to make it more clear what is going on. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191732000.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 9582fcbcd8ec..7d0270bd793e 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -157,25 +157,25 @@ static void p6_pmu_enable_all(int added) static inline void p6_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. + */ (void)wrmsrl_safe(hwc->config_base, val); } -- cgit v1.2.3 From 876ee61aadf01aa0db981b5d249cbdd53dc28b5e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 4 Oct 2012 14:48:10 +0100 Subject: x86-64: Fix page table accounting Commit 20167d3421a089a1bf1bd680b150dc69c9506810 ("x86-64: Fix accounting in kernel_physical_mapping_init()") went a little too far by entirely removing the counting of pre-populated page tables: this should be done at boot time (to cover the page tables set up in early boot code), but shouldn't be done during memory hot add. Hence, re-add the removed increments of "pages", but make them and the one in phys_pte_init() conditional upon !after_bootmem. Reported-Acked-and-Tested-by: Hugh Dickins Signed-off-by: Jan Beulich Cc: Link: http://lkml.kernel.org/r/506DAFBA020000780009FA8C@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 2b6b4a3c8beb..3baff255adac 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -386,7 +386,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, * these mappings are more intelligent. */ if (pte_val(*pte)) { - pages++; + if (!after_bootmem) + pages++; continue; } @@ -451,6 +452,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, * attributes. */ if (page_size_mask & (1 << PG_LEVEL_2M)) { + if (!after_bootmem) + pages++; last_map_addr = next; continue; } @@ -526,6 +529,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, * attributes. */ if (page_size_mask & (1 << PG_LEVEL_1G)) { + if (!after_bootmem) + pages++; last_map_addr = next; continue; } -- cgit v1.2.3 From 032c3851f51141e30de02ed0bc50a7743dfd776d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 24 Oct 2012 16:42:20 +0800 Subject: perf/x86/uncore: Handle pci_read_config_dword() errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This, beyond handling corner cases, also fixes some build warnings: arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_disable_box’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:124:9: warning: ‘config’ is used uninitialized in this function [-Wuninitialized] arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_enable_box’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:135:9: warning: ‘config’ is used uninitialized in this function [-Wuninitialized] arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_read_counter’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:164:2: warning: ‘count’ is used uninitialized in this function [-Wuninitialized] Signed-off-by: Yan, Zheng Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/1351068140-13456-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 43 +++++++++++++++++---------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 5df8d32ba91e..6f874771ee65 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -118,22 +118,24 @@ static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config |= SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -156,7 +158,7 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; - u64 count; + u64 count = 0; pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); @@ -603,11 +605,12 @@ static struct pci_driver snbep_uncore_pci_driver = { /* * build pci bus to socket mapping */ -static void snbep_pci2phy_map_init(void) +static int snbep_pci2phy_map_init(void) { struct pci_dev *ubox_dev = NULL; int i, bus, nodeid; - u32 config; + int err = 0; + u32 config = 0; while (1) { /* find the UBOX device */ @@ -618,10 +621,14 @@ static void snbep_pci2phy_map_init(void) break; bus = ubox_dev->bus->number; /* get the Node ID of the local register */ - pci_read_config_dword(ubox_dev, 0x40, &config); + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; nodeid = config; /* get the Node ID mapping */ - pci_read_config_dword(ubox_dev, 0x54, &config); + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; /* * every three bits in the Node ID mapping register maps * to a particular node. @@ -633,7 +640,11 @@ static void snbep_pci2phy_map_init(void) } } }; - return; + + if (ubox_dev) + pci_dev_put(ubox_dev); + + return err ? pcibios_err_to_errno(err) : 0; } /* end of Sandy Bridge-EP uncore support */ @@ -2578,9 +2589,11 @@ static int __init uncore_pci_init(void) switch (boot_cpu_data.x86_model) { case 45: /* Sandy Bridge-EP */ + ret = snbep_pci2phy_map_init(); + if (ret) + return ret; pci_uncores = snbep_pci_uncores; uncore_pci_driver = &snbep_uncore_pci_driver; - snbep_pci2phy_map_init(); break; default: return 0; -- cgit v1.2.3 From ae5ba47a990a18c869d66916fd72fb334c45cf91 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:03:21 -0400 Subject: perf/x86: Make Intel KNC use full 40-bit width of counters Early versions of Intel KNC chips have a bug where bits above 32 were not properly set. We worked around this by only using the bottom 32 bits (out of 40 that should be available). It turns out this workaround breaks overflow handling. The buggy silicon will in theory never be used in production systems, so remove this workaround so we get proper overflow support. Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171302140.23243@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 7c46bfdbc373..73bcfbdedd50 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -226,12 +226,11 @@ static __initconst struct x86_pmu knc_pmu = { .event_map = knc_pmu_event_map, .max_events = ARRAY_SIZE(knc_perfmon_event_map), .apic = 1, - .max_period = (1ULL << 31) - 1, + .max_period = (1ULL << 39) - 1, .version = 0, .num_counters = 2, - /* in theory 40 bits, early silicon is buggy though */ - .cntval_bits = 32, - .cntval_mask = (1ULL << 32) - 1, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = knc_event_constraints, .format_attrs = intel_knc_formats_attr, -- cgit v1.2.3 From 7d011962afbaa6e572cd8e0dbb7abf773e166e64 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:04:33 -0400 Subject: perf/x86: Remove cpuc->enable check on Intl KNC event enable/disable x86_pmu.enable() is called from x86_pmu_enable() with cpuc->enabled set to 0. This means we weren't re-enabling the counters after a context switch. This patch just removes the check, as it should't be necessary (and the equivelent x86_ generic code does not have the checks). The origin of this problem is the KNC driver being based on the P6 one. The P6 driver also has this issue, but works anyway due to various lucky accidents. Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Cc: Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171303290.23243@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 73bcfbdedd50..f3a2af41552d 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -173,26 +173,22 @@ static void knc_pmu_enable_all(int added) static inline void knc_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } static void knc_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } -- cgit v1.2.3 From e4074b3049f99c6ad6e1a33e6d93d8ec0652e2c1 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:05:45 -0400 Subject: perf/x86: Enable overflow on Intel KNC with a custom knc_pmu_handle_irq() Although based on the Intel P6 design, the interrupt mechnanism for KNC more closely resembles the Intel architectural perfmon one. We can't just re-use that code though, because KNC has different MSR numbers for the status and ack registers. In this case we just cut-and paste from perf_event_intel.c with some minor changes, as it looks like it would not be worth the trouble to change that code to be MSR-configurable. Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171304410.23243@vincent-weaver-1.um.maine.edu [ Small stylistic edits. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 78 +++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index f3a2af41552d..4b7731bf23a8 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -3,6 +3,8 @@ #include #include +#include + #include "perf_event.h" static const u64 knc_perfmon_event_map[] = @@ -193,6 +195,80 @@ static void knc_pmu_enable_event(struct perf_event *event) (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = &__get_cpu_var(cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -210,7 +286,7 @@ static struct attribute *intel_knc_formats_attr[] = { static __initconst struct x86_pmu knc_pmu = { .name = "knc", - .handle_irq = x86_pmu_handle_irq, + .handle_irq = knc_pmu_handle_irq, .disable_all = knc_pmu_disable_all, .enable_all = knc_pmu_enable_all, .enable = knc_pmu_enable_event, -- cgit v1.2.3 From 3e8fa263a97079c74880675c451587bb6899e661 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 19 Oct 2012 13:25:46 +0100 Subject: x86/efi: Fix oops caused by incorrect set_memory_uc() usage Calling __pa() with an ioremap'd address is invalid. If we encounter an efi_memory_desc_t without EFI_MEMORY_WB set in ->attribute we currently call set_memory_uc(), which in turn calls __pa() on a potentially ioremap'd address. On CONFIG_X86_32 this results in the following oops: BUG: unable to handle kernel paging request at f7f22280 IP: [] reserve_ram_pages_type+0x89/0x210 *pdpt = 0000000001978001 *pde = 0000000001ffb067 *pte = 0000000000000000 Oops: 0000 [#1] PREEMPT SMP Modules linked in: Pid: 0, comm: swapper Not tainted 3.0.0-acpi-efi-0805 #3 EIP: 0060:[] EFLAGS: 00010202 CPU: 0 EIP is at reserve_ram_pages_type+0x89/0x210 EAX: 0070e280 EBX: 38714000 ECX: f7814000 EDX: 00000000 ESI: 00000000 EDI: 38715000 EBP: c189fef0 ESP: c189fea8 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 0, ti=c189e000 task=c18bbe60 task.ti=c189e000) Stack: 80000200 ff108000 00000000 c189ff00 00038714 00000000 00000000 c189fed0 c104f8ca 00038714 00000000 00038715 00000000 00000000 00038715 00000000 00000010 38715000 c189ff48 c1025aff 38715000 00000000 00000010 00000000 Call Trace: [] ? page_is_ram+0x1a/0x40 [] reserve_memtype+0xdf/0x2f0 [] set_memory_uc+0x49/0xa0 [] efi_enter_virtual_mode+0x1c2/0x3aa [] start_kernel+0x291/0x2f2 [] ? loglevel+0x1b/0x1b [] i386_start_kernel+0xbf/0xc8 The only time we can call set_memory_uc() for a memory region is when it is part of the direct kernel mapping. For the case where we ioremap a memory region we must leave it alone. This patch reimplements the fix from e8c7106280a3 ("x86, efi: Calling __pa() with an ioremap()ed address is invalid") which was reverted in e1ad783b12ec because it caused a regression on some MacBooks (they hung at boot). The regression was caused because the commit only marked EFI_RUNTIME_SERVICES_DATA as E820_RESERVED_EFI, when it should have marked all regions that have the EFI_MEMORY_RUNTIME attribute. Despite first impressions, it's not possible to use ioremap_cache() to map all cached memory regions on CONFIG_X86_64 because of the way that the memory map might be configured as detailed in the following bug report, https://bugzilla.redhat.com/show_bug.cgi?id=748516 e.g. some of the EFI memory regions *need* to be mapped as part of the direct kernel mapping. Signed-off-by: Matt Fleming Cc: Matthew Garrett Cc: Zhang Rui Cc: Huang Ying Cc: Keith Packard Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1350649546-23541-1-git-send-email-matt@console-pimps.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 5 +++-- arch/x86/platform/efi/efi.c | 29 ++++++++++++++++++----------- arch/x86/platform/efi/efi_64.c | 7 +++++-- 3 files changed, 26 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c9dcc181d4d1..36ff332da130 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -35,7 +35,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ efi_call_virt(f, a1, a2, a3, a4, a5, a6) -#define efi_ioremap(addr, size, type) ioremap_cache(addr, size) +#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -89,7 +89,7 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, - u32 type); + u32 type, u64 attribute); #endif /* CONFIG_X86_32 */ @@ -98,6 +98,7 @@ extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +extern void efi_memory_uc(u64 addr, unsigned long size); #ifndef CONFIG_EFI /* diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index aded2a91162a..cb34839c97c5 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -810,6 +810,16 @@ void __iomem *efi_lookup_mapped_addr(u64 phys_addr) return NULL; } +void efi_memory_uc(u64 addr, unsigned long size) +{ + unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; + u64 npages; + + npages = round_up(size, page_shift) / page_shift; + memrange_efi_to_native(&addr, &npages); + set_memory_uc(addr, npages); +} + /* * This function will switch the EFI runtime services to virtual mode. * Essentially, look through the EFI memmap and map every region that @@ -823,7 +833,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages, end_pfn; + u64 end, systab, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -879,10 +889,14 @@ void __init efi_enter_virtual_mode(void) end_pfn = PFN_UP(end); if (end_pfn <= max_low_pfn_mapped || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) + && end_pfn <= max_pfn_mapped)) { va = __va(md->phys_addr); - else - va = efi_ioremap(md->phys_addr, size, md->type); + + if (!(md->attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)va, size); + } else + va = efi_ioremap(md->phys_addr, size, + md->type, md->attribute); md->virt_addr = (u64) (unsigned long) va; @@ -892,13 +906,6 @@ void __init efi_enter_virtual_mode(void) continue; } - if (!(md->attribute & EFI_MEMORY_WB)) { - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_uc(addr, npages); - } - systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && systab < end) { systab += md->virt_addr - md->phys_addr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac3aa54e2654..95fd505dfeb6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -82,7 +82,7 @@ void __init efi_call_phys_epilog(void) } void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, - u32 type) + u32 type, u64 attribute) { unsigned long last_map_pfn; @@ -92,8 +92,11 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { unsigned long top = last_map_pfn << PAGE_SHIFT; - efi_ioremap(top, size - (top - phys_addr), type); + efi_ioremap(top, size - (top - phys_addr), type, attribute); } + if (!(attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)__va(phys_addr), size); + return (void __iomem *)__va(phys_addr); } -- cgit v1.2.3 From 64dfab8e83644902ad2fd559a56c411b47e3ef3c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 22 Oct 2012 16:51:38 +0800 Subject: perf/x86: Remove unused variable in nhmex_rbox_alter_er() The variable port is initialized but never used otherwise, so remove the unused variable. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Cc: Yan, Zheng Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/CAPgLHd8NZkYSkZm22FpZxiEh6HcA0q-V%3D29vdnheiDhgrJZ%2Byw@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 6f874771ee65..3cf3d97cce3a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1558,7 +1558,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int port; /* adjust the main event selector and extra register index */ if (reg1->idx % 2) { @@ -1570,7 +1569,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) } /* adjust extra register config */ - port = reg1->idx / 6 + box->pmu->pmu_idx * 4; switch (reg1->idx % 6) { case 2: /* shift the 8~15 bits to the 0~7 bits */ -- cgit v1.2.3 From 94777fc51b3ad85ff9f705ddf7cdd0eb3bbad5a6 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 16 Oct 2012 07:50:21 -0500 Subject: x86/irq/ioapic: Check for valid irq_cfg pointer in smp_irq_move_cleanup_interrupt Posting this patch to fix an issue concerning sparse irq's that I raised a while back. There was discussion about adding refcounting to sparse irqs (to fix other potential race conditions), but that does not appear to have been addressed yet. This covers the only issue of this type that I've encountered in this area. A NULL pointer dereference can occur in smp_irq_move_cleanup_interrupt() if we haven't yet setup the irq_cfg pointer in the irq_desc.irq_data.chip_data. In create_irq_nr() there is a window where we have set vector_irq in __assign_irq_vector(), but not yet called irq_set_chip_data() to set the irq_cfg pointer. Should an IRQ_MOVE_CLEANUP_VECTOR hit the cpu in question during this time, smp_irq_move_cleanup_interrupt() will attempt to process the aforementioned irq, but panic when accessing irq_cfg. Only continue processing the irq if irq_cfg is non-NULL. Signed-off-by: Dimitri Sivanich Cc: Suresh Siddha Cc: Joerg Roedel Cc: Yinghai Lu Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/20121016125021.GA22935@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c265593ec2cd..1817fa911024 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2257,6 +2257,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); + if (!cfg) + continue; + raw_spin_lock(&desc->lock); /* -- cgit v1.2.3 From 6ede1fd3cb404c0016de6ac529df46d561bd558b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Oct 2012 16:35:18 -0700 Subject: x86, mm: Trim memory in memblock to be page aligned We will not map partial pages, so need to make sure memblock allocation will not allocate those bytes out. Also we will use for_each_mem_pfn_range() to loop to map memory range to keep them consistent. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/e820.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index ed858e9e9a74..df06ade26bef 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1077,6 +1077,9 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } + /* throw away partial pages */ + memblock_trim_memory(PAGE_SIZE); + memblock_dump_all(); } -- cgit v1.2.3 From 1f2ff682ac951ed82cc043cf140d2851084512df Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Oct 2012 16:35:18 -0700 Subject: x86, mm: Use memblock memory loop instead of e820_RAM We need to handle E820_RAM and E820_RESERVED_KERNEL at the same time. Also memblock has page aligned range for ram, so we could avoid mapping partial pages. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/setup.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 468e98dfd44e..5d888af8682a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -921,18 +921,19 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + unsigned long start, end; + unsigned long start_pfn, end_pfn; - if (ei->addr + ei->size <= 1UL << 32) - continue; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, + NULL) { - if (ei->type == E820_RESERVED) + end = PFN_PHYS(end_pfn); + if (end <= (1UL<<32)) continue; + start = PFN_PHYS(start_pfn); max_pfn_mapped = init_memory_mapping( - ei->addr < 1UL << 32 ? 1UL << 32 : ei->addr, - ei->addr + ei->size); + max((1UL<<32), start), end); } /* can we preseve max_low_pfn ?*/ -- cgit v1.2.3 From 844ab6f993b1d32eb40512503d35ff6ad0c57030 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Wed, 24 Oct 2012 14:24:44 -0500 Subject: x86, mm: Find_early_table_space based on ranges that are actually being mapped Current logic finds enough space for direct mapping page tables from 0 to end. Instead, we only need to find enough space to cover mr[0].start to mr[nr_range].end -- the range that is actually being mapped by init_memory_mapping() This is needed after 1bbbbe779aabe1f0768c2bf8f8c0a5583679b54a, to address the panic reported here: https://lkml.org/lkml/2012/10/20/160 https://lkml.org/lkml/2012/10/21/157 Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/20121024195311.GB11779@jshin-Toonie Tested-by: Tom Rini Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 70 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8653b3a722be..bc287d62bf1e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,36 +29,54 @@ int direct_gbpages #endif ; -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +/* + * First calculate space needed for kernel direct mapping page tables to cover + * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB + * pages. Then find enough contiguous space for those page tables. + */ +static void __init find_early_table_space(struct map_range *mr, int nr_range) { - unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; + int i; + unsigned long puds = 0, pmds = 0, ptes = 0, tables; + unsigned long start = 0, good_end; phys_addr_t base; - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + for (i = 0; i < nr_range; i++) { + unsigned long range, extra; - if (use_gbpages) { - unsigned long extra; + range = mr[i].end - mr[i].start; + puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { + extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); + pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else { + pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; + } - if (use_pse) { - unsigned long extra; - - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { + extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); #ifdef CONFIG_X86_32 - extra += PMD_SIZE; + extra += PMD_SIZE; #endif - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + /* The first 2/4M doesn't use large pages. */ + if (mr[i].start < PMD_SIZE) + extra += range; + + ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + } + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); #ifdef CONFIG_X86_32 @@ -76,7 +94,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - end - 1, pgt_buf_start << PAGE_SHIFT, + mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); } @@ -85,12 +103,6 @@ void __init native_pagetable_reserve(u64 start, u64 end) memblock_reserve(start, end - start); } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -263,7 +275,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(end, use_pse, use_gbpages); + find_early_table_space(mr, nr_range); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, -- cgit v1.2.3 From 5189c2a7c7769ee9d037d76c1a7b8550ccf3481c Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Wed, 24 Oct 2012 10:00:44 -0700 Subject: x86: efi: Turn off efi_enabled after setup on mixed fw/kernel When 32-bit EFI is used with 64-bit kernel (or vice versa), turn off efi_enabled once setup is done. Beyond setup, it is normally used to determine if runtime services are available and we will have none. This will resolve issues stemming from efivars modprobe panicking on a 32/64-bit setup, as well as some reboot issues on similar setups. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=45991 Reported-by: Marko Kohtala Reported-by: Maxim Kammerer Signed-off-by: Olof Johansson Acked-by: Maarten Lankhorst Cc: stable@kernel.org # 3.4 - 3.6 Cc: Matthew Garrett Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 + arch/x86/kernel/setup.c | 12 ++++++++++++ arch/x86/platform/efi/efi.c | 18 ++++++++++-------- 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c9dcc181d4d1..029189db84de 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -98,6 +98,7 @@ extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +extern void efi_unmap_memmap(void); #ifndef CONFIG_EFI /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a2bb18e02839..6fd7752cee96 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1035,6 +1035,18 @@ void __init setup_arch(char **cmdline_p) arch_init_ideal_nops(); register_refined_jiffies(CLOCK_TICK_RATE); + +#ifdef CONFIG_EFI + /* Once setup is done above, disable efi_enabled on mismatched + * firmware/kernel archtectures since there is no support for + * runtime services. + */ + if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + efi_enabled = 0; + } +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index aded2a91162a..9bae3b73fccc 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,11 +70,15 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; bool efi_64bit; -static bool efi_native; static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; +static inline bool efi_is_native(void) +{ + return IS_ENABLED(CONFIG_X86_64) == efi_64bit; +} + static int __init setup_noefi(char *arg) { efi_enabled = 0; @@ -420,7 +424,7 @@ void __init efi_reserve_boot_services(void) } } -static void __init efi_unmap_memmap(void) +void __init efi_unmap_memmap(void) { if (memmap.map) { early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); @@ -432,7 +436,7 @@ void __init efi_free_boot_services(void) { void *p; - if (!efi_native) + if (!efi_is_native()) return; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -684,12 +688,10 @@ void __init efi_init(void) return; } efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; - efi_native = !efi_64bit; #else efi_phys.systab = (efi_system_table_t *) (boot_params.efi_info.efi_systab | ((__u64)boot_params.efi_info.efi_systab_hi<<32)); - efi_native = efi_64bit; #endif if (efi_systab_init(efi_phys.systab)) { @@ -723,7 +725,7 @@ void __init efi_init(void) * that doesn't match the kernel 32/64-bit mode. */ - if (!efi_native) + if (!efi_is_native()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else if (efi_runtime_init()) { efi_enabled = 0; @@ -735,7 +737,7 @@ void __init efi_init(void) return; } #ifdef CONFIG_X86_32 - if (efi_native) { + if (efi_is_native()) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } @@ -834,7 +836,7 @@ void __init efi_enter_virtual_mode(void) * non-native EFI */ - if (!efi_native) { + if (!efi_is_native()) { efi_unmap_memmap(); return; } -- cgit v1.2.3 From f82f64dd9f485e13f29f369772d4a0e868e5633a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 25 Oct 2012 15:45:26 -0700 Subject: x86, mm: Undo incorrect revert in arch/x86/mm/init.c Commit 844ab6f9 x86, mm: Find_early_table_space based on ranges that are actually being mapped added back some lines back wrongly that has been removed in commit 7b16bbf97 Revert "x86/mm: Fix the size calculation of mapping tables" remove them again. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQW_vuaYQbmagVnxT2DGsYc=9tNeAbdBq53sYkitPOwxSQ@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bc287d62bf1e..d7aea41563b3 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -65,10 +65,6 @@ static void __init find_early_table_space(struct map_range *mr, int nr_range) #ifdef CONFIG_X86_32 extra += PMD_SIZE; #endif - /* The first 2/4M doesn't use large pages. */ - if (mr[i].start < PMD_SIZE) - extra += range; - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else { ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; -- cgit v1.2.3 From 2c5594df344cd1ff0cc9bf007dea3235582b3acf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 26 Oct 2012 11:40:28 +0200 Subject: rcu: Fix unrecovered RCU user mode in syscall_trace_leave() On x86-64 syscall exit, 3 non exclusive events may happen looping in the following order: 1) Check if we need resched for user preemption, if so call schedule_user() 2) Check if we have pending signals, if so call do_notify_resume() 3) Check if we do syscall tracing, if so call syscall_trace_leave() However syscall_trace_leave() has been written assuming it directly follows the syscall and forget about the above possible 1st and 2nd steps. Now schedule_user() and do_notify_resume() exit in RCU user mode because they have most chances to resume userspace immediately and this avoids an rcu_user_enter() call in the syscall fast path. So by the time we call syscall_trace_leave(), we may well be in RCU user mode. To fix this up, simply call rcu_user_exit() in the beginning of this function. This fixes some reported RCU uses in extended quiescent state. Reported-by: Dave Jones Reported-by: Sergey Senozhatsky Signed-off-by: Frederic Weisbecker Tested-by: Sergey Senozhatsky Signed-off-by: Paul E. McKenney --- arch/x86/kernel/ptrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b00b33a18390..eff5b8c68652 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1511,6 +1511,13 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. + */ + rcu_user_exit(); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) -- cgit v1.2.3 From e6d41e8c697e07832efa4a85bf23438bc4c4e1b2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 29 Oct 2012 18:40:08 +0100 Subject: x86, AMD: Change Boris' email address Move to private email and put in maintained status. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1351532410-4887-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 698b6ec12e0f..1ac581f38dfa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -6,7 +6,7 @@ * * Written by Jacob Shin - AMD, Inc. * - * Support: borislav.petkov@amd.com + * Maintained by: Borislav Petkov * * April 2006 * - added support for AMD Family 0x10 processors -- cgit v1.2.3 From 943482d07e926128eed0482b879736f912c429e4 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Mon, 29 Oct 2012 18:51:38 +0100 Subject: x86, microcode_amd: Change email addresses, MAINTAINERS entry Signed-off-by: Andreas Herrmann Cc: lm-sensors@lm-sensors.org Cc: oprofile-list@lists.sf.net Cc: Stephane Eranian Cc: Robert Richter Cc: Borislav Petkov Cc: Jorg Roedel Cc: Rafael J. Wysocki Cc: Jean Delvare Cc: Guenter Roeck Link: http://lkml.kernel.org/r/20121029175138.GC5024@tweety Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7720ff5a9ee2..b3e67ba55b77 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -8,8 +8,8 @@ * Tigran Aivazian * * Maintainers: - * Andreas Herrmann - * Borislav Petkov + * Andreas Herrmann + * Borislav Petkov * * This driver allows to upgrade microcode on F10h AMD * CPUs and later. -- cgit v1.2.3 From f49f4ab95c301dbccad0efe85296d908b8ae7ad4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 29 Oct 2012 14:40:18 +0100 Subject: x86/ce4100: Fix pm_poweroff The CE4100 platform is currently missing a proper pm_poweroff implementation leading to poweroff making the CPU spin forever and the CE4100 platform does not enter a low-power mode where the external Power Management Unit can properly power off the system. Power off on this platform is implemented pretty much like reboot, by writing to the SoC built-in 8051 microcontroller mapped at I/O port 0xcf9, the value 0x4. Signed-off-by: Florian Fainelli Acked-by: Sebastian Andrzej Siewior Cc: rui.zhang@intel.com Cc: alan@linux.intel.com Link: http://lkml.kernel.org/r/1351518020-25556-2-git-send-email-ffainelli@freebox.fr Signed-off-by: Ingo Molnar --- arch/x86/platform/ce4100/ce4100.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 4c61b52191eb..5de16e359fe6 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -27,6 +27,18 @@ static int ce4100_i8042_detect(void) return 0; } +/* + * The CE4100 platform has an internal 8051 Microcontroller which is + * responsible for signaling to the external Power Management Unit the + * intention to reset, reboot or power off the system. This 8051 device has + * its command register mapped at I/O port 0xcf9 and the value 0x4 is used + * to power off the system. + */ +static void ce4100_power_off(void) +{ + outb(0x4, 0xcf9); +} + #ifdef CONFIG_SERIAL_8250 static unsigned int mem_serial_in(struct uart_port *p, int offset) @@ -143,4 +155,6 @@ void __init x86_ce4100_early_setup(void) x86_init.pci.init_irq = sdv_pci_init; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; #endif + + pm_power_off = ce4100_power_off; } -- cgit v1.2.3 From d7959916026aaae60e1878ae33c7503b2cc4471d Mon Sep 17 00:00:00 2001 From: Maxime Bizon Date: Mon, 29 Oct 2012 14:40:19 +0100 Subject: x86/ce4100: Fix reboot by forcing the reboot method to be KBD The default reboot is via ACPI for this platform, and the CEFDK bootloader actually supports this, but will issue a system power off instead of a real reboot. Setting the reboot method to be KBD instead of ACPI ensures proper system reboot. Acked-by: Sebastian Andrzej Siewior Signed-off-by: Maxime Bizon Signed-off-by: Florian Fainelli Cc: rui.zhang@intel.com Cc: alan@linux.intel.com Link: http://lkml.kernel.org/r/1351518020-25556-3-git-send-email-ffainelli@freebox.fr Signed-off-by: Ingo Molnar --- arch/x86/platform/ce4100/ce4100.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 5de16e359fe6..92525cb8e54c 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -21,6 +21,7 @@ #include #include #include +#include static int ce4100_i8042_detect(void) { @@ -151,6 +152,15 @@ void __init x86_ce4100_early_setup(void) x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.pci.init = ce4100_pci_init; + /* + * By default, the reboot method is ACPI which is supported by the + * CE4100 bootloader CEFDK using FADT.ResetReg Address and ResetValue + * the bootloader will however issue a system power off instead of + * reboot. By using BOOT_KBD we ensure proper system reboot as + * expected. + */ + reboot_type = BOOT_KBD; + #ifdef CONFIG_X86_IO_APIC x86_init.pci.init_irq = sdv_pci_init; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; -- cgit v1.2.3 From 37aeec36220c39f1b2e7118287d951fd9cfdd6b7 Mon Sep 17 00:00:00 2001 From: Maxime Bizon Date: Mon, 29 Oct 2012 14:40:20 +0100 Subject: x86/ce4100: Fix PCI configuration register access for devices without interrupts Some CE4100 devices such as the: - DFX module (01:0b.7) - entertainment encryption device (01:10.0) - multimedia controller (01:12.0) do not have a device interrupt at all. This patch fixes the PCI controller code to declare the missing PCI configuration register space, as well as a fixup method for forcing the interrupt pin to be 0 for these devices. This is required to ensure that pci drivers matching on these devices will be able to honor the various PCI subsystem calls touching the configuration space. Signed-off-by: Maxime Bizon Signed-off-by: Florian Fainelli Acked-by: Sebastian Andrzej Siewior Cc: rui.zhang@intel.com Cc: alan@linux.intel.com Link: http://lkml.kernel.org/r/1351518020-25556-4-git-send-email-ffainelli@freebox.fr Signed-off-by: Ingo Molnar --- arch/x86/pci/ce4100.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 41bd2a2d2c50..b914e20b5a00 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -115,6 +115,16 @@ static void sata_revid_read(struct sim_dev_reg *reg, u32 *value) reg_read(reg, value); } +static void reg_noirq_read(struct sim_dev_reg *reg, u32 *value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + /* force interrupt pin value to 0 */ + *value = reg->sim_reg.value & 0xfff00ff; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write) DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write) @@ -144,6 +154,7 @@ static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write) DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write) DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 7, 0x3c, 256, reg_init, reg_noirq_read, reg_write) DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write) DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write) @@ -161,8 +172,10 @@ static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write) DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write) DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write) + DEFINE_REG(18, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write) }; static void __init init_sim_regs(void) -- cgit v1.2.3 From b6514633bdc6a511f7c44b3ecb86d6071374239d Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Thu, 25 Oct 2012 11:00:24 +0200 Subject: x86: remove obsolete comment from asm/xen/hypervisor.h Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/hypervisor.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 66d0fff1ee84..125f344f06a9 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -33,7 +33,6 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H -/* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -- cgit v1.2.3 From 85b97637bb40a9f486459dd254598759af9c3d50 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 29 Oct 2012 11:01:50 +0800 Subject: x86/mce: Do not change worker's running cpu in cmci_rediscover(). cmci_rediscover() used set_cpus_allowed_ptr() to change the current process's running cpu, and migrate itself to the dest cpu. But worker processes are not allowed to be migrated. If current is a worker, the worker will be migrated to another cpu, but the corresponding worker_pool is still on the original cpu. In this case, the following BUG_ON in try_to_wake_up_local() will be triggered: BUG_ON(rq != this_rq()); This will cause the kernel panic. The call trace is like the following: [ 6155.451107] ------------[ cut here ]------------ [ 6155.452019] kernel BUG at kernel/sched/core.c:1654! ...... [ 6155.452019] RIP: 0010:[] [] try_to_wake_up_local+0x115/0x130 ...... [ 6155.452019] Call Trace: [ 6155.452019] [] __schedule+0x764/0x880 [ 6155.452019] [] schedule+0x29/0x70 [ 6155.452019] [] schedule_timeout+0x235/0x2d0 [ 6155.452019] [] ? mark_held_locks+0x8d/0x140 [ 6155.452019] [] ? __lock_release+0x133/0x1a0 [ 6155.452019] [] ? _raw_spin_unlock_irq+0x30/0x50 [ 6155.452019] [] ? trace_hardirqs_on_caller+0x105/0x190 [ 6155.452019] [] wait_for_common+0x12b/0x180 [ 6155.452019] [] ? try_to_wake_up+0x2f0/0x2f0 [ 6155.452019] [] wait_for_completion+0x1d/0x20 [ 6155.452019] [] stop_one_cpu+0x8a/0xc0 [ 6155.452019] [] ? __migrate_task+0x1a0/0x1a0 [ 6155.452019] [] ? complete+0x28/0x60 [ 6155.452019] [] set_cpus_allowed_ptr+0x128/0x130 [ 6155.452019] [] cmci_rediscover+0xf5/0x140 [ 6155.452019] [] mce_cpu_callback+0x18d/0x19d [ 6155.452019] [] notifier_call_chain+0x67/0x150 [ 6155.452019] [] __raw_notifier_call_chain+0xe/0x10 [ 6155.452019] [] __cpu_notify+0x20/0x40 [ 6155.452019] [] cpu_notify_nofail+0x15/0x30 [ 6155.452019] [] _cpu_down+0x262/0x2e0 [ 6155.452019] [] cpu_down+0x36/0x50 [ 6155.452019] [] acpi_processor_remove+0x50/0x11e [ 6155.452019] [] acpi_device_remove+0x90/0xb2 [ 6155.452019] [] __device_release_driver+0x7c/0xf0 [ 6155.452019] [] device_release_driver+0x2f/0x50 [ 6155.452019] [] acpi_bus_remove+0x32/0x6d [ 6155.452019] [] acpi_bus_trim+0x87/0xee [ 6155.452019] [] acpi_bus_hot_remove_device+0x88/0x16b [ 6155.452019] [] acpi_os_execute_deferred+0x27/0x34 [ 6155.452019] [] process_one_work+0x219/0x680 [ 6155.452019] [] ? process_one_work+0x1b8/0x680 [ 6155.452019] [] ? acpi_os_wait_events_complete+0x23/0x23 [ 6155.452019] [] worker_thread+0x12e/0x320 [ 6155.452019] [] ? manage_workers+0x110/0x110 [ 6155.452019] [] kthread+0xc6/0xd0 [ 6155.452019] [] kernel_thread_helper+0x4/0x10 [ 6155.452019] [] ? retint_restore_args+0x13/0x13 [ 6155.452019] [] ? __init_kthread_worker+0x70/0x70 [ 6155.452019] [] ? gs_change+0x13/0x13 This patch removes the set_cpus_allowed_ptr() call, and put the cmci rediscover jobs onto all the other cpus using system_wq. This could bring some delay for the jobs. Signed-off-by: Tang Chen Signed-off-by: Miao Xie Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 5f88abf07e9c..4f9a3cbfc4a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -285,34 +285,39 @@ void cmci_clear(void) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } +static long cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); + + return 0; +} + /* * After a CPU went down cycle through all the others and rediscover * Must run in process context. */ void cmci_rediscover(int dying) { - int banks; - int cpu; - cpumask_var_t old; + int cpu, banks; if (!cmci_supported(&banks)) return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); for_each_online_cpu(cpu) { if (cpu == dying) continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) + + if (cpu == smp_processor_id()) { + cmci_rediscover_work_func(NULL); continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks); - } + } - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); + work_on_cpu(cpu, cmci_rediscover_work_func, NULL); + } } /* -- cgit v1.2.3 From 95a7d76897c1e7243d4137037c66d15cbf2cce76 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 31 Oct 2012 12:38:31 -0400 Subject: xen/mmu: Use Xen specific TLB flush instead of the generic one. As Mukesh explained it, the MMUEXT_TLB_FLUSH_ALL allows the hypervisor to do a TLB flush on all active vCPUs. If instead we were using the generic one (which ends up being xen_flush_tlb) we end up making the MMUEXT_TLB_FLUSH_LOCAL hypercall. But before we make that hypercall the kernel will IPI all of the vCPUs (even those that were asleep from the hypervisor perspective). The end result is that we needlessly wake them up and do a TLB flush when we can just let the hypervisor do it correctly. This patch gives around 50% speed improvement when migrating idle guest's from one host to another. Oracle-bug: 14630170 CC: stable@vger.kernel.org Tested-by: Jingjie Jiang Suggested-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 6226c99729b9..dcf5f2dd91ec 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1288,6 +1288,25 @@ unsigned long xen_read_cr2_direct(void) return this_cpu_read(xen_vcpu_info.arch.cr2); } +void xen_flush_tlb_all(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_all(0); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_ALL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} static void xen_flush_tlb(void) { struct mmuext_op *op; @@ -2518,7 +2537,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, err = 0; out: - flush_tlb_all(); + xen_flush_tlb_all(); return err; } -- cgit v1.2.3 From 2bbf0a1427c377350f001fbc6260995334739ad7 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 31 Oct 2012 17:20:50 +0100 Subject: x86, amd: Disable way access filter on Piledriver CPUs The Way Access Filter in recent AMD CPUs may hurt the performance of some workloads, caused by aliasing issues in the L1 cache. This patch disables it on the affected CPUs. The issue is similar to that one of last year: http://lkml.indiana.edu/hypermail/linux/kernel/1107.3/00041.html This new patch does not replace the old one, we just need another quirk for newer CPUs. The performance penalty without the patch depends on the circumstances, but is a bit less than the last year's 3%. The workloads affected would be those that access code from the same physical page under different virtual addresses, so different processes using the same libraries with ASLR or multiple instances of PIE-binaries. The code needs to be accessed simultaneously from both cores of the same compute unit. More details can be found here: http://developer.amd.com/Assets/SharedL1InstructionCacheonAMD15hCPU.pdf CPUs affected are anything with the core known as Piledriver. That includes the new parts of the AMD A-Series (aka Trinity) and the just released new CPUs of the FX-Series (aka Vishera). The model numbering is a bit odd here: FX CPUs have model 2, A-Series has model 10h, with possible extensions to 1Fh. Hence the range of model ids. Signed-off-by: Andre Przywara Link: http://lkml.kernel.org/r/1351700450-9277-1-git-send-email-osp@andrep.de Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d12..1b7d1656a042 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -631,6 +631,20 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + u64 val; + + if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) { + val |= 0x1E; + wrmsrl_safe(0xc0011021, val); + } + } + cpu_detect_cache_sizes(c); /* Multi core CPU? */ -- cgit v1.2.3 From 87da7e66a40532b743cd50972fcf85a1f15b14ea Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Oct 2012 14:07:59 +0800 Subject: KVM: x86: fix vcpu->mmio_fragments overflow After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), the pieces of io data can be collected and write them to the guest memory or MMIO together Unfortunately, kvm splits the mmio access into 8 bytes and store them to vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it will cause vcpu->mmio_fragments overflow The bug can be exposed by isapc (-M isapc): [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC [ ......] [23154.858083] Call Trace: [23154.859874] [] kvm_get_cr8+0x1d/0x28 [kvm] [23154.861677] [] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] [23154.863604] [] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] Actually, we can use one mmio_fragment to store a large mmio access then split it when we pass the mmio-exit-info to userspace. After that, we only need two entries to store mmio info for the cross-mmio pages access Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 60 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1eefebe5d727..224a7e78cb6c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3779,7 +3779,7 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, { struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; - memcpy(vcpu->run->mmio.data, frag->data, frag->len); + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); return X86EMUL_CONTINUE; } @@ -3832,18 +3832,11 @@ mmio: bytes -= handled; val += handled; - while (bytes) { - unsigned now = min(bytes, 8U); - - frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; - frag->gpa = gpa; - frag->data = val; - frag->len = now; - - gpa += now; - val += now; - bytes -= now; - } + WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->gpa = gpa; + frag->data = val; + frag->len = bytes; return X86EMUL_CONTINUE; } @@ -3890,7 +3883,7 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, vcpu->mmio_needed = 1; vcpu->mmio_cur_fragment = 0; - vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; + vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = gpa; @@ -5522,28 +5515,44 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu) * * read: * for each fragment - * write gpa, len - * exit - * copy data + * for each mmio piece in the fragment + * write gpa, len + * exit + * copy data * execute insn * * write: * for each fragment - * write gpa, len - * copy data - * exit + * for each mmio piece in the fragment + * write gpa, len + * copy data + * exit */ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; struct kvm_mmio_fragment *frag; + unsigned len; BUG_ON(!vcpu->mmio_needed); /* Complete previous fragment */ - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + len = min(8u, frag->len); if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, frag->len); + memcpy(frag->data, run->mmio.data, len); + + if (frag->len <= 8) { + /* Switch to the next fragment. */ + frag++; + vcpu->mmio_cur_fragment++; + } else { + /* Go forward to the next mmio piece. */ + frag->data += len; + frag->gpa += len; + frag->len -= len; + } + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; if (vcpu->mmio_is_write) @@ -5551,13 +5560,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->mmio_read_completed = 1; return complete_emulated_io(vcpu); } - /* Initiate next fragment */ - ++frag; + run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, frag->len); - run->mmio.len = frag->len; + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + run->mmio.len = min(8u, frag->len); run->mmio.is_write = vcpu->mmio_is_write; vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; -- cgit v1.2.3 From cf47a83fb06e42ae1b572ed68326068c7feaceae Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 19 Oct 2012 15:25:37 -0400 Subject: xen/hypercall: fix hypercall fallback code for very old hypervisors While copying the argument structures in HYPERVISOR_event_channel_op() and HYPERVISOR_physdev_op() into the local variable is sufficiently safe even if the actual structure is smaller than the container one, copying back eventual output values the same way isn't: This may collide with on-stack variables (particularly "rc") which may change between the first and second memcpy() (i.e. the second memcpy() could discard that change). Move the fallback code into out-of-line functions, and handle all of the operations known by this old a hypervisor individually: Some don't require copying back anything at all, and for the rest use the individual argument structures' sizes rather than the container's. Reported-by: Dan Carpenter Signed-off-by: Jan Beulich [v2: Reduce #define/#undef usage in HYPERVISOR_physdev_op_compat().] [v3: Fix compile errors when modules use said hypercalls] [v4: Add xen_ prefix to the HYPERCALL_..] [v5: Alter the name and only EXPORT_SYMBOL_GPL one of them] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/hypercall.h | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 59c226d120cd..c20d1ce62dc6 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -359,18 +359,14 @@ HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, return _hypercall4(int, update_va_mapping, va, new_val.pte, new_val.pte >> 32, flags); } +extern int __must_check xen_event_channel_op_compat(int, void *); static inline int HYPERVISOR_event_channel_op(int cmd, void *arg) { int rc = _hypercall2(int, event_channel_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct evtchn_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, event_channel_op_compat, &op); - memcpy(arg, &op.u, sizeof(op.u)); - } + if (unlikely(rc == -ENOSYS)) + rc = xen_event_channel_op_compat(cmd, arg); return rc; } @@ -386,17 +382,14 @@ HYPERVISOR_console_io(int cmd, int count, char *str) return _hypercall3(int, console_io, cmd, count, str); } +extern int __must_check HYPERVISOR_physdev_op_compat(int, void *); + static inline int HYPERVISOR_physdev_op(int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct physdev_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, physdev_op_compat, &op); - memcpy(arg, &op.u, sizeof(op.u)); - } + if (unlikely(rc == -ENOSYS)) + rc = HYPERVISOR_physdev_op_compat(cmd, arg); return rc; } -- cgit v1.2.3 From 6d369a09ccd8104b35646b507112d3ddca4344eb Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Nov 2012 11:26:34 +0000 Subject: x86: Export asm/{svm.h,vmx.h,perf_regs.h} Export asm/{svm.h,vmx.h,perf_regs.h} so that they can be disintegrated. It looks from previous commits that the first two should have been exported, but the header-y lines weren't added to the Kbuild. I'm guessing that asm/perf_regs.h should be exported too. Signed-off-by: David Howells --- arch/x86/include/asm/Kbuild | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 66e5f0ef0523..79fd8a3418f9 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -12,6 +12,7 @@ header-y += mce.h header-y += msr-index.h header-y += msr.h header-y += mtrr.h +header-y += perf_regs.h header-y += posix_types_32.h header-y += posix_types_64.h header-y += posix_types_x32.h @@ -19,8 +20,10 @@ header-y += prctl.h header-y += processor-flags.h header-y += ptrace-abi.h header-y += sigcontext32.h +header-y += svm.h header-y += ucontext.h header-y += vm86.h +header-y += vmx.h header-y += vsyscall.h genhdr-y += unistd_32.h -- cgit v1.2.3 From 6d1068b3a98519247d8ba4ec85cd40ac136dbdf9 Mon Sep 17 00:00:00 2001 From: Petr Matousek Date: Tue, 6 Nov 2012 19:24:07 +0100 Subject: KVM: x86: invalid opcode oops on SET_SREGS with OSXSAVE bit set (CVE-2012-4461) On hosts without the XSAVE support unprivileged local user can trigger oops similar to the one below by setting X86_CR4_OSXSAVE bit in guest cr4 register using KVM_SET_SREGS ioctl and later issuing KVM_RUN ioctl. invalid opcode: 0000 [#2] SMP Modules linked in: tun ip6table_filter ip6_tables ebtable_nat ebtables ... Pid: 24935, comm: zoog_kvm_monito Tainted: G D 3.2.0-3-686-pae EIP: 0060:[] EFLAGS: 00210246 CPU: 0 EIP is at kvm_arch_vcpu_ioctl_run+0x92a/0xd13 [kvm] EAX: 00000001 EBX: 000f387e ECX: 00000000 EDX: 00000000 ESI: 00000000 EDI: 00000000 EBP: ef5a0060 ESP: d7c63e70 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process zoog_kvm_monito (pid: 24935, ti=d7c62000 task=ed84a0c0 task.ti=d7c62000) Stack: 00000001 f70a1200 f8b940a9 ef5a0060 00000000 00200202 f8769009 00000000 ef5a0060 000f387e eda5c020 8722f9c8 00015bae 00000000 ed84a0c0 ed84a0c0 c12bf02d 0000ae80 ef7f8740 fffffffb f359b740 ef5a0060 f8b85dc1 0000ae80 Call Trace: [] ? kvm_arch_vcpu_ioctl_set_sregs+0x2fe/0x308 [kvm] ... [] ? syscall_call+0x7/0xb Code: 89 e8 e8 14 ee ff ff ba 00 00 04 00 89 e8 e8 98 48 ff ff 85 c0 74 1e 83 7d 48 00 75 18 8b 85 08 07 00 00 31 c9 8b 95 0c 07 00 00 <0f> 01 d1 c7 45 48 01 00 00 00 c7 45 1c 01 00 00 00 0f ae f0 89 EIP: [] kvm_arch_vcpu_ioctl_run+0x92a/0xd13 [kvm] SS:ESP 0068:d7c63e70 QEMU first retrieves the supported features via KVM_GET_SUPPORTED_CPUID and then sets them later. So guest's X86_FEATURE_XSAVE should be masked out on hosts without X86_FEATURE_XSAVE, making kvm_set_cr4 with X86_CR4_OSXSAVE fail. Userspaces that allow specifying guest cpuid with X86_FEATURE_XSAVE even on hosts that do not support it, might be susceptible to this attack from inside the guest as well. Allow setting X86_CR4_OSXSAVE bit only if host has XSAVE support. Signed-off-by: Petr Matousek Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/cpuid.h | 3 +++ arch/x86/kvm/x86.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a10e46016851..58fc51488828 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -24,6 +24,9 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + if (!static_cpu_has(X86_FEATURE_XSAVE)) + return 0; + best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 224a7e78cb6c..4f7641756be2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5781,6 +5781,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits, idx; struct desc_ptr dt; + if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); -- cgit v1.2.3 From ddd32b4289a6feda9ad9e68b0d85641c389a72ba Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 13 Nov 2012 02:17:36 +0900 Subject: x86, mm: Correct vmflag test for checking VM_HUGETLB commit 611ae8e3f5204f7480b3b405993b3352cfa16662('enable tlb flush range support for x86') change flush_tlb_mm_range() considerably. After this, we test whether vmflag equal to VM_HUGETLB and it may be always failed, because vmflag usually has other flags simultaneously. Our intention is to check whether this vma is for hughtlb, so correct it according to this purpose. Signed-off-by: Joonsoo Kim Acked-by: Alex Shi Link: http://lkml.kernel.org/r/1352740656-19417-1-git-send-email-js1304@gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0777f042e400..60f926cd8b0e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -197,7 +197,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, } if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 - || vmflag == VM_HUGETLB) { + || vmflag & VM_HUGETLB) { local_flush_tlb(); goto flush_all; } -- cgit v1.2.3 From 29282fde80d44e587f8c152b10049a56e61659f0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 9 Nov 2012 15:20:17 +0100 Subject: KVM: x86: Fix invalid secondary exec controls in vmx_cpuid_update() The commit [ad756a16: KVM: VMX: Implement PCID/INVPCID for guests with EPT] introduced the unconditional access to SECONDARY_VM_EXEC_CONTROL, and this triggers kernel warnings like below on old CPUs: vmwrite error: reg 401e value a0568000 (err 12) Pid: 13649, comm: qemu-kvm Not tainted 3.7.0-rc4-test2+ #154 Call Trace: [] vmwrite_error+0x27/0x29 [kvm_intel] [] vmcs_writel+0x1b/0x20 [kvm_intel] [] vmx_cpuid_update+0x74/0x170 [kvm_intel] [] kvm_vcpu_ioctl_set_cpuid2+0x76/0x90 [kvm] [] kvm_arch_vcpu_ioctl+0xc37/0xed0 [kvm] [] ? __vunmap+0x9c/0x110 [] ? vmx_vcpu_load+0x39/0x1a0 [kvm_intel] [] ? kvm_arch_vcpu_load+0x52/0x1a0 [kvm] [] ? vcpu_load+0x74/0xd0 [kvm] [] kvm_vcpu_ioctl+0x110/0x5e0 [kvm] [] ? kvm_dev_ioctl+0x4d/0x4a0 [kvm] [] do_vfs_ioctl+0x8f/0x530 [] ? remove_vma+0x56/0x60 [] ? do_munmap+0x328/0x400 [] ? fget_light+0x4c/0x100 [] sys_ioctl+0x91/0xb0 [] system_call_fastpath+0x1a/0x1f This patch adds a check for the availability of secondary exec control to avoid these warnings. Cc: [v3.6+] Signed-off-by: Takashi Iwai Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad6b1dd06f8b..f85815945fc6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6549,19 +6549,22 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && best && (best->ebx & bit(X86_FEATURE_INVPCID)) && guest_cpuid_has_pcid(vcpu)) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } else { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } -- cgit v1.2.3 From caaa8c6339ccefee6e834c4a8b0bec7e721e7357 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Sat, 27 Oct 2012 20:34:15 -0200 Subject: x86: remove dummy long from EFI stub Commit 2e064b1 (x86, efi: Fix issue of overlapping .reloc section for EFI_STUB) removed a dummy reloc added by commit 291f363 (x86, efi: EFI boot stub support), but forgot to remove the dummy long used by that reloc. Reviewed-by: Jordan Justen Tested-by: Lee G Rosenbaum Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Cesar Eduardo Barros Signed-off-by: Matt Fleming --- arch/x86/boot/header.S | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2a017441b8b2..8c132a625b94 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -476,6 +476,3 @@ die: setup_corrupt: .byte 7 .string "No setup signature found...\n" - - .data -dummy: .long 0 -- cgit v1.2.3 From 0f905a43ce955b638139bd84486194770a6a2c08 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 20 Nov 2012 13:07:46 +0000 Subject: x86, efi: Fix processor-specific memcpy() build error Building for Athlon/Duron/K7 results in the following build error, arch/x86/boot/compressed/eboot.o: In function `__constant_memcpy3d': eboot.c:(.text+0x385): undefined reference to `_mmx_memcpy' arch/x86/boot/compressed/eboot.o: In function `efi_main': eboot.c:(.text+0x1a22): undefined reference to `_mmx_memcpy' because the boot stub code doesn't link with the kernel proper, and therefore doesn't have access to the 3DNow version of memcpy. So, follow the example of misc.c and #undef memcpy so that we use the version provided by misc.c. See https://bugzilla.kernel.org/show_bug.cgi?id=50391 Reported-by: Al Viro Reported-by: Ryan Underwood Cc: H. Peter Anvin Cc: stable@vger.kernel.org Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c760e073963e..e87b0cac14b5 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -12,6 +12,8 @@ #include #include +#undef memcpy /* Use memcpy from misc.c */ + #include "eboot.h" static efi_system_table_t *sys_table; -- cgit v1.2.3 From 1022623842cb72ee4d0dbf02f6937f38c92c3f41 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 3 Sep 2012 20:54:48 +0200 Subject: x86-32: Fix invalid stack address while in softirq In 32 bit the stack address provided by kernel_stack_pointer() may point to an invalid range causing NULL pointer access or page faults while in NMI (see trace below). This happens if called in softirq context and if the stack is empty. The address at ®s->sp is then out of range. Fixing this by checking if regs and ®s->sp are in the same stack context. Otherwise return the previous stack pointer stored in struct thread_info. If that address is invalid too, return address of regs. BUG: unable to handle kernel NULL pointer dereference at 0000000a IP: [] print_context_stack+0x6e/0x8d *pde = 00000000 Oops: 0000 [#1] SMP Modules linked in: Pid: 4434, comm: perl Not tainted 3.6.0-rc3-oprofile-i386-standard-g4411a05 #4 Hewlett-Packard HP xw9400 Workstation/0A1Ch EIP: 0060:[] EFLAGS: 00010093 CPU: 0 EIP is at print_context_stack+0x6e/0x8d EAX: ffffe000 EBX: 0000000a ECX: f4435f94 EDX: 0000000a ESI: f4435f94 EDI: f4435f94 EBP: f5409ec0 ESP: f5409ea0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 0000000a CR3: 34ac9000 CR4: 000007d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 Process perl (pid: 4434, ti=f5408000 task=f5637850 task.ti=f4434000) Stack: 000003e8 ffffe000 00001ffc f4e39b00 00000000 0000000a f4435f94 c155198c f5409ef0 c1003723 c155198c f5409f04 00000000 f5409edc 00000000 00000000 f5409ee8 f4435f94 f5409fc4 00000001 f5409f1c c12dce1c 00000000 c155198c Call Trace: [] dump_trace+0x7b/0xa1 [] x86_backtrace+0x40/0x88 [] ? oprofile_add_sample+0x56/0x84 [] oprofile_add_sample+0x75/0x84 [] op_amd_check_ctrs+0x46/0x260 [] profile_exceptions_notify+0x23/0x4c [] nmi_handle+0x31/0x4a [] ? ftrace_define_fields_irq_handler_entry+0x45/0x45 [] do_nmi+0xa0/0x2ff [] ? ftrace_define_fields_irq_handler_entry+0x45/0x45 [] nmi_stack_correct+0x28/0x2d [] ? ftrace_define_fields_irq_handler_entry+0x45/0x45 [] ? do_softirq+0x4b/0x7f [] irq_exit+0x35/0x5b [] smp_apic_timer_interrupt+0x6c/0x7a [] apic_timer_interrupt+0x2a/0x30 Code: 89 fe eb 08 31 c9 8b 45 0c ff 55 ec 83 c3 04 83 7d 10 00 74 0c 3b 5d 10 73 26 3b 5d e4 73 0c eb 1f 3b 5d f0 76 1a 3b 5d e8 73 15 <8b> 13 89 d0 89 55 e0 e8 ad 42 03 00 85 c0 8b 55 e0 75 a6 eb cc EIP: [] print_context_stack+0x6e/0x8d SS:ESP 0068:f5409ea0 CR2: 000000000000000a ---[ end trace 62afee3481b00012 ]--- Kernel panic - not syncing: Fatal exception in interrupt V2: * add comments to kernel_stack_pointer() * always return a valid stack address by falling back to the address of regs Reported-by: Yang Wei Cc: Signed-off-by: Robert Richter Link: http://lkml.kernel.org/r/20120912135059.GZ8285@erda.amd.com Signed-off-by: H. Peter Anvin Cc: Jun Zhang --- arch/x86/include/asm/ptrace.h | 15 ++++----------- arch/x86/kernel/ptrace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index dcfde52979c3..19f16ebaf4fa 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -205,21 +205,14 @@ static inline bool user_64bit_mode(struct pt_regs *regs) } #endif -/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. - * - * This is valid only for kernel mode traps. - */ -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ #ifdef CONFIG_X86_32 - return (unsigned long)(®s->sp); +extern unsigned long kernel_stack_pointer(struct pt_regs *regs); #else +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ return regs->sp; -#endif } +#endif #define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b00b33a18390..2484e331a64d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -166,6 +166,34 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 +/* + * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode + * when it traps. The previous stack will be directly underneath the saved + * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. + * + * Now, if the stack is empty, '®s->sp' is out of range. In this + * case we try to take the previous stack. To always return a non-null + * stack pointer we fall back to regs as stack if no previous stack + * exists. + * + * This is valid only for kernel mode traps. + */ +unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); + unsigned long sp = (unsigned long)®s->sp; + struct thread_info *tinfo; + + if (context == (sp & ~(THREAD_SIZE - 1))) + return sp; + + tinfo = (struct thread_info *)context; + if (tinfo->previous_esp) + return tinfo->previous_esp; + + return (unsigned long)regs; +} + static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); -- cgit v1.2.3 From cb57a2b4cff7edf2a4e32c0163200e9434807e0a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 20 Nov 2012 22:21:02 -0800 Subject: x86-32: Export kernel_stack_pointer() for modules Modules, in particular oprofile (and possibly other similar tools) need kernel_stack_pointer(), so export it using EXPORT_SYMBOL_GPL(). Cc: Yang Wei Cc: Robert Richter Cc: Jun Zhang Cc: Link: http://lkml.kernel.org/r/20120912135059.GZ8285@erda.amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/ptrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2484e331a64d..5e0596b0632e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -193,6 +194,7 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) return (unsigned long)regs; } +EXPORT_SYMBOL_GPL(kernel_stack_pointer); static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { -- cgit v1.2.3 From 36c46ca4f322a7bf89aad5462a3a1f61713edce7 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 15 Nov 2012 13:41:50 -0500 Subject: x86, microcode, AMD: Add support for family 16h processors Add valid patch size for family 16h processors. [ hpa: promoting to urgent/stable since it is hw enabling and trivial ] Signed-off-by: Boris Ostrovsky Acked-by: Andreas Herrmann Link: http://lkml.kernel.org/r/1353004910-2204-1-git-send-email-boris.ostrovsky@amd.com Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/microcode_amd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index b3e67ba55b77..efdec7cd8e01 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -190,6 +190,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 switch (c->x86) { case 0x14: @@ -198,6 +199,9 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, case 0x15: max_size = F15H_MPB_MAX_SIZE; break; + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; -- cgit v1.2.3 From ee4eb87be2c3f69c2c4d9f1c1d98e363a7ad18ab Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 2 Nov 2012 11:18:39 +0000 Subject: x86-64: Fix ordering of CFI directives and recent ASM_CLAC additions While these got added in the right place everywhere else, entry_64.S is the odd one where they ended up before the initial CFI directive(s). In order to cover the full code ranges, the CFI directive must be first, though. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/5093BA1F02000078000A600E@nat28.tlf.novell.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b51b2c7ee51f..1328fe49a3f1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -995,8 +995,8 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - ASM_CLAC XCPT_FRAME + ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ /* 0(%rsp): old_rsp-ARGOFFSET */ @@ -1135,8 +1135,8 @@ END(common_interrupt) */ .macro apicinterrupt num sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC pushq_cfi $~(\num) .Lcommon_\sym: interrupt \do_sym @@ -1190,8 +1190,8 @@ apicinterrupt IRQ_WORK_VECTOR \ */ .macro zeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1208,8 +1208,8 @@ END(\sym) .macro paranoidzeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1227,8 +1227,8 @@ END(\sym) #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1247,8 +1247,8 @@ END(\sym) .macro errorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1266,8 +1266,8 @@ END(\sym) /* error code is on the stack already */ .macro paranoiderrorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -- cgit v1.2.3 From cb7cb2864e758a1b040040bc55e404c677c911cb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 21 Nov 2012 14:41:21 -0800 Subject: x86, kvm: Remove incorrect redundant assembly constraint In __emulate_1op_rax_rdx, we use "+a" and "+d" which are input/output constraints, and *then* use "a" and "d" as input constraints. This is incorrect, but happens to work on some versions of gcc. However, it breaks gcc with -O0 and icc, and may break on future versions of gcc. Reported-and-tested-by: Melanie Blower Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/B3584E72CFEBED439A3ECA9BCE67A4EF1B17AF90@FMSMSX107.amr.corp.intel.com Reviewed-by: Paolo Bonzini Acked-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 39171cb307ea..bba39bfa1c4b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -426,8 +426,7 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) _ASM_EXTABLE(1b, 3b) \ : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ - "a" (*rax), "d" (*rdx)); \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -- cgit v1.2.3 From 6662c34fa9c60a48aaa5879cb229cd9a84de9c22 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Nov 2012 08:54:36 -0800 Subject: x86-32: Unbreak booting on some 486 clones There appear to have been some 486 clones, including the "enhanced" version of Am486, which have CPUID but not CR4. These 486 clones had only the FPU flag, if any, unlike the Intel 486s with CPUID, which also had VME and therefore needed CR4. Therefore, look at the basic CPUID flags and require at least one bit other than bit 0 before we modify CR4. Thanks to Christian Ludloff of sandpile.org for confirming this as a problem. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 957a47aec64e..4dac2f68ed4a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -292,8 +292,8 @@ default_entry: * be using the global pages. * * NOTE! If we are on a 486 we may have no cr4 at all! - * Specifically, cr4 exists if and only if CPUID exists, - * which in turn exists if and only if EFLAGS.ID exists. + * Specifically, cr4 exists if and only if CPUID exists + * and has flags other than the FPU flag set. */ movl $X86_EFLAGS_ID,%ecx pushl %ecx @@ -308,6 +308,11 @@ default_entry: testl %ecx,%eax jz 6f # No ID flag = no CPUID = no CR4 + movl $1,%eax + cpuid + andl $~1,%edx # Ignore CPUID.FPU + jz 6f # No flags or only CPUID.FPU = no CR4 + movl pa(mmu_cr4_features),%eax movl %eax,%cr4 -- cgit v1.2.3 From 644c154186386bb1fa6446bc5e037b9ed098db46 Mon Sep 17 00:00:00 2001 From: Vincent Palatin Date: Fri, 30 Nov 2012 12:15:32 -0800 Subject: x86, fpu: Avoid FPU lazy restore after suspend When a cpu enters S3 state, the FPU state is lost. After resuming for S3, if we try to lazy restore the FPU for a process running on the same CPU, this will result in a corrupted FPU context. Ensure that "fpu_owner_task" is properly invalided when (re-)initializing a CPU, so nobody will try to lazy restore a state which doesn't exist in the hardware. Tested with a 64-bit kernel on a 4-core Ivybridge CPU with eagerfpu=off, by doing thousands of suspend/resume cycles with 4 processes doing FPU operations running. Without the patch, a process is killed after a few hundreds cycles by a SIGFPE. Cc: Duncan Laurie Cc: Olof Johansson Cc: v3.4+ # for 3.4 need to replace this_cpu_write by percpu_write Signed-off-by: Vincent Palatin Link: http://lkml.kernel.org/r/1354306532-1014-1-git-send-email-vpalatin@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 15 +++++++++------ arch/x86/kernel/smpboot.c | 5 +++++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 831dbb9c6c02..41ab26ea6564 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -399,14 +399,17 @@ static inline void drop_init_fpu(struct task_struct *tsk) typedef struct { int preload; } fpu_switch_t; /* - * FIXME! We could do a totally lazy restore, but we need to - * add a per-cpu "this was the task that last touched the FPU - * on this CPU" variable, and the task needs to have a "I last - * touched the FPU on this CPU" and check them. + * Must be run with preemption disabled: this clears the fpu_owner_task, + * on this CPU. * - * We don't do that yet, so "fpu_lazy_restore()" always returns - * false, but some day.. + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_owner_task, cpu) = NULL; +} + static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { return new == this_cpu_read_stable(fpu_owner_task) && diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c80a33bc528b..f3e2ec878b8c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,6 +68,8 @@ #include #include #include +#include +#include #include #include #include @@ -818,6 +820,9 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* the FPU context is blank, nobody can own it */ + __cpu_disable_lazy_restore(cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); -- cgit v1.2.3