From 439793d4b3c99e550daebd868bbd58967c93d0b3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 1 Aug 2012 17:01:42 +0300 Subject: KVM: x86: update KVM_SAVE_MSRS_BEGIN to correct value When MSR_KVM_PV_EOI_EN was added to msrs_to_save array KVM_SAVE_MSRS_BEGIN was not updated accordingly. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42bce48f6928..dce75b760312 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 9 +#define KVM_SAVE_MSRS_BEGIN 10 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, -- cgit v1.2.3 From c6fd893da927c6cefb2ece22402765379921a834 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 31 Jul 2012 10:29:14 -0700 Subject: x86, avx: don't use avx instructions with "noxsave" boot param Clear AVX, AVX2 features along with clearing XSAVE feature bits, as part of the parsing "noxsave" parameter. Fixes the kernel boot panic with "noxsave" boot parameter. We could have checked cpu_has_osxsave along with cpu_has_avx etc, but Peter mentioned clearing the feature bits will be better for uses like static_cpu_has() etc. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1343755754.2041.2.camel@sbsiddha-desk.sc.intel.com Cc: # v3.5 Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 46d8786d655e..a5fbc3c5fccc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -144,6 +144,8 @@ static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); return 1; } __setup("noxsave", x86_xsave_setup); -- cgit v1.2.3 From 484d90eec884d814b005c9736bcf3fd018acba65 Mon Sep 17 00:00:00 2001 From: Andrew Boie Date: Fri, 10 Aug 2012 11:49:06 -0700 Subject: x86, build: Globally set -fno-pic GCC built with nonstandard options can enable -fpic by default. We never want this for 32-bit kernels and it will break the build. [ hpa: Notably the Android toolchain apparently does this. ] Change-Id: Iaab7d66e598b1c65ac4a4f0229eca2cd3d0d2898 Signed-off-by: Andrew Boie Link: http://lkml.kernel.org/r/1344624546-29691-1-git-send-email-andrew.p.boie@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 4 ++++ arch/x86/boot/Makefile | 2 +- arch/x86/realmode/rm/Makefile | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b0c5276861ec..682e9c210baa 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,6 +27,10 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return + # Never want PIC in a 32-bit kernel, prevent breakage with GCC built + # with nonstandard options + KBUILD_CFLAGS += -fno-pic + # prevent gcc from keeping the stack 16 byte aligned KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 5a747dd884db..f7535bedc33f 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -57,7 +57,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ -include $(srctree)/$(src)/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ + -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ $(call cc-option, -ffreestanding) \ $(call cc-option, -fno-toplevel-reorder,\ $(call cc-option, -fno-unit-at-a-time)) \ diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index b2d534cab25f..88692871823f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -72,7 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ -include $(srctree)/$(src)/../../boot/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ + -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ $(call cc-option, -ffreestanding) \ $(call cc-option, -fno-toplevel-reorder,\ $(call cc-option, -fno-unit-at-a-time)) \ -- cgit v1.2.3 From cffa59baa5f1cf3e3e9e172697db48912471531c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 2 Aug 2012 12:55:27 +0200 Subject: perf, x86: Fix uncore_types_exit section mismatch Fix the following section mismatch: WARNING: arch/x86/kernel/cpu/built-in.o(.text+0x7ad9): Section mismatch in reference from the function uncore_types_exit() to the function .init.text:uncore_type_exit() The function uncore_types_exit() references the function __init uncore_type_exit(). This is often because uncore_types_exit lacks a __init annotation or the annotation of uncore_type_exit is wrong. caused by 14371cce03c2 ("perf: Add generic PCI uncore PMU device support"). Cc: Zheng Yan Cc: Ingo Molnar Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-8-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 7563fda9f033..a7ccd68aa13a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2373,7 +2373,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->attr_groups[1] = NULL; } -static void uncore_types_exit(struct intel_uncore_type **types) +static void __init uncore_types_exit(struct intel_uncore_type **types) { int i; for (i = 0; types[i]; i++) -- cgit v1.2.3 From ebb6cc03596cc89c89670473282ea46573feb34f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 Aug 2012 13:11:21 +0800 Subject: perf/x86: Fixes for Nehalem-EX uncore driver This patch includes following fixes and update: - Only some events in the Sbox and Mbox can use the match/mask registers, add code to check this. - The format definitions for xbr_mm_cfg and xbr_match registers in the Rbox are wrong, xbr_mm_cfg should use 32 bits, xbr_match should use 64 bits. - Cleanup the Rbox code. Compute the addresses extra registers in the enable_event function instead of the hw_config function. This simplifies the code in nhmex_rbox_alter_er(). Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344229882-3907-2-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 195 ++++++++++++-------------- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 1 + 2 files changed, 87 insertions(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index a7ccd68aa13a..84434e2a676f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -796,7 +796,6 @@ static struct intel_uncore_type *nhm_msr_uncores[] = { DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); -DEFINE_UNCORE_FORMAT_ATTR(mm_cfg, mm_cfg, "config:63"); DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); @@ -1032,24 +1031,22 @@ static struct intel_uncore_type nhmex_uncore_bbox = { static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) { - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - if (event->attr.config & NHMEX_S_PMON_MM_CFG_EN) { - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - } else { - reg1->config = ~0ULL; - reg2->config = ~0ULL; - } + /* only TO_R_PROG_EV event uses the match/mask register */ + if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != + NHMEX_S_EVENT_TO_R_PROG_EV) + return 0; if (box->pmu->pmu_idx == 0) reg1->reg = NHMEX_S0_MSR_MM_CFG; else reg1->reg = NHMEX_S1_MSR_MM_CFG; - reg1->idx = 0; - + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; return 0; } @@ -1059,8 +1056,8 @@ static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event_extra *reg1 = &hwc->extra_reg; struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - wrmsrl(reg1->reg, 0); - if (reg1->config != ~0ULL || reg2->config != ~0ULL) { + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, 0); wrmsrl(reg1->reg + 1, reg1->config); wrmsrl(reg1->reg + 2, reg2->config); wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); @@ -1074,7 +1071,6 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = { &format_attr_edge.attr, &format_attr_inv.attr, &format_attr_thresh8.attr, - &format_attr_mm_cfg.attr, &format_attr_match.attr, &format_attr_mask.attr, NULL, @@ -1264,7 +1260,8 @@ again: } /* for the match/mask registers */ - if ((uncore_box_is_fake(box) || !reg2->alloc) && + if (reg2->idx != EXTRA_REG_NONE && + (uncore_box_is_fake(box) || !reg2->alloc) && !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) goto fail; @@ -1278,7 +1275,8 @@ again: if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) nhmex_mbox_alter_er(event, idx[0], true); reg1->alloc |= alloc; - reg2->alloc = 1; + if (reg2->idx != EXTRA_REG_NONE) + reg2->alloc = 1; } return NULL; fail: @@ -1342,9 +1340,6 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event struct extra_reg *er; unsigned msr; int reg_idx = 0; - - if (WARN_ON_ONCE(reg1->idx != -1)) - return -EINVAL; /* * The mbox events may require 2 extra MSRs at the most. But only * the lower 32 bits in these MSRs are significant, so we can use @@ -1355,11 +1350,6 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; - if (er->idx == __BITS_VALUE(reg1->idx, 0, 8) || - er->idx == __BITS_VALUE(reg1->idx, 1, 8)) - continue; - if (WARN_ON_ONCE(reg_idx >= 2)) - return -EINVAL; msr = er->msr + type->msr_offset * box->pmu->pmu_idx; if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) @@ -1368,6 +1358,8 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event /* always use the 32~63 bits to pass the PLD config */ if (er->idx == EXTRA_REG_NHMEX_M_PLD) reg_idx = 1; + else if (WARN_ON_ONCE(reg_idx > 0)) + return -EINVAL; reg1->idx &= ~(0xff << (reg_idx * 8)); reg1->reg &= ~(0xffff << (reg_idx * 16)); @@ -1376,17 +1368,21 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event reg1->config = event->attr.config1; reg_idx++; } - /* use config2 to pass the filter config */ - reg2->idx = EXTRA_REG_NHMEX_M_FILTER; - if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) - reg2->config = event->attr.config2; - else - reg2->config = ~0ULL; - if (box->pmu->pmu_idx == 0) - reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; - else - reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; - + /* + * The mbox only provides ability to perform address matching + * for the PLD events. + */ + if (reg_idx == 2) { + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + } return 0; } @@ -1422,34 +1418,36 @@ static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct per wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), nhmex_mbox_shared_reg_config(box, idx)); - wrmsrl(reg2->reg, 0); - if (reg2->config != ~0ULL) { - wrmsrl(reg2->reg + 1, - reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); - wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & - (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); - wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + if (reg2->idx != EXTRA_REG_NONE) { + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } } wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); } -DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); -DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); -DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); -DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); -DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cfg, filter_cfg, "config2:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); -DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); static struct attribute *nhmex_uncore_mbox_formats_attr[] = { &format_attr_count_mode.attr, @@ -1458,7 +1456,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = { &format_attr_flag_mode.attr, &format_attr_inc_sel.attr, &format_attr_set_flag_sel.attr, - &format_attr_filter_cfg.attr, + &format_attr_filter_cfg_en.attr, &format_attr_filter_match.attr, &format_attr_filter_mask.attr, &format_attr_dsp.attr, @@ -1513,7 +1511,7 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) struct hw_perf_event_extra *reg1 = &hwc->extra_reg; int port; - /* adjust the main event selector */ + /* adjust the main event selector and extra register index */ if (reg1->idx % 2) { reg1->idx--; hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; @@ -1522,29 +1520,17 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; } - /* adjust address or config of extra register */ + /* adjust extra register config */ port = reg1->idx / 6 + box->pmu->pmu_idx * 4; switch (reg1->idx % 6) { - case 0: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); - break; - case 1: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); - break; case 2: - /* the 8~15 bits to the 0~7 bits */ + /* shift the 8~15 bits to the 0~7 bits */ reg1->config >>= 8; break; case 3: - /* the 0~7 bits to the 8~15 bits */ + /* shift the 0~7 bits to the 8~15 bits */ reg1->config <<= 8; break; - case 4: - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); - break; - case 5: - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); - break; }; } @@ -1671,7 +1657,7 @@ static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int port, idx; + int idx; idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> NHMEX_R_PMON_CTL_EV_SEL_SHIFT; @@ -1681,27 +1667,11 @@ static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event reg1->idx = idx; reg1->config = event->attr.config1; - port = idx / 6 + box->pmu->pmu_idx * 4; - idx %= 6; - switch (idx) { - case 0: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); - break; - case 1: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); - break; - case 2: - case 3: - reg1->reg = NHMEX_R_MSR_PORTN_QLX_CFG(port); - break; + switch (idx % 6) { case 4: case 5: - if (idx == 4) - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); - else - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); - reg2->config = event->attr.config2; hwc->config |= event->attr.config & (~0ULL << 32); + reg2->config = event->attr.config2; break; }; return 0; @@ -1727,28 +1697,34 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx, er_idx; + int idx, port; - idx = reg1->idx % 6; - er_idx = idx; - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; + idx = reg1->idx; + port = idx / 6 + box->pmu->pmu_idx * 4; - switch (idx) { + switch (idx % 6) { case 0: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + break; case 1: - wrmsrl(reg1->reg, reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); break; case 2: case 3: - wrmsrl(reg1->reg, nhmex_rbox_shared_reg_config(box, er_idx)); + wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + nhmex_rbox_shared_reg_config(box, 2 + (idx / 6) * 5)); break; case 4: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + break; case 5: - wrmsrl(reg1->reg, reg1->config); - wrmsrl(reg1->reg + 1, hwc->config >> 32); - wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); break; }; @@ -1756,8 +1732,8 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct per (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); } -DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); @@ -2303,6 +2279,7 @@ int uncore_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_tag = ~0ULL; event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; if (event->attr.config == UNCORE_FIXED_EVENT) { /* no fixed counter */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index c9e5dc56630a..8384e9b543b7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -230,6 +230,7 @@ #define NHMEX_S1_MSR_MASK 0xe5a #define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) +#define NHMEX_S_EVENT_TO_R_PROG_EV 0 /* NHM-EX Mbox */ #define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 -- cgit v1.2.3 From cb37af77124e8532e6ae3f9ca332593ba423b5f8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 Aug 2012 13:11:22 +0800 Subject: perf/x86: Add Intel Westmere-EX uncore support The Westmere-EX uncore is similar to the Nehalem-EX uncore. The differences are: - Westmere-EX uncore has 10 instances of Cbox. The MSRs for Cbox8 and Cbox9 in the Westmere-EX aren't contiguous with Cbox 0~7. - The fvid field in the ZDP_CTL_FVC register in the Mbox is different. It's 5 bits in the Nehalem-EX, 6 bits in the Westmere-EX. Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344229882-3907-3-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 56 ++++++++++++++++++++++----- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 45 ++++++++++----------- 2 files changed, 68 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 84434e2a676f..0a5571080e74 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -901,16 +901,21 @@ static struct attribute_group nhmex_uncore_cbox_format_group = { .attrs = nhmex_uncore_cbox_formats_attr, }; +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + static struct intel_uncore_type nhmex_uncore_cbox = { .name = "cbox", .num_counters = 6, - .num_boxes = 8, + .num_boxes = 10, .perf_ctr_bits = 48, .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, .event_mask = NHMEX_PMON_RAW_EVENT_MASK, .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_C_MSR_OFFSET, + .msr_offsets = nhmex_cbox_msr_offsets, .pair_ctr_ctl = 1, .ops = &nhmex_uncore_ops, .format_group = &nhmex_uncore_cbox_format_group @@ -1138,6 +1143,9 @@ static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { EVENT_EXTRA_END }; +/* Nehalem-EX or Westmere-EX ? */ +bool uncore_nhmex; + static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) { struct intel_uncore_extra_reg *er; @@ -1167,18 +1175,29 @@ static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 return false; /* mask of the shared fields */ - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; raw_spin_lock_irqsave(&er->lock, flags); /* add mask of the non-shared field if it's in use */ - if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) - mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { + if (uncore_nhmex) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + } if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { atomic_add(1 << (idx * 8), &er->ref); - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | - NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | + WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); er->config &= ~mask; er->config |= (config & mask); ret = true; @@ -1212,7 +1231,10 @@ u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) /* get the non-shared control bits and shift them */ idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (uncore_nhmex) + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); if (new_idx > orig_idx) { idx = new_idx - orig_idx; config <<= 3 * idx; @@ -1222,6 +1244,10 @@ u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) } /* add the shared control bits back */ + if (uncore_nhmex) + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + else + config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; if (modify) { /* adjust the main event selector */ @@ -1480,6 +1506,12 @@ static struct uncore_event_desc nhmex_uncore_mbox_events[] = { { /* end: all zeroes */ }, }; +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + static struct intel_uncore_ops nhmex_uncore_mbox_ops = { NHMEX_UNCORE_OPS_COMMON_INIT(), .enable_event = nhmex_mbox_msr_enable_event, @@ -2791,7 +2823,13 @@ static int __init uncore_cpu_init(void) snbep_uncore_cbox.num_boxes = max_cores; msr_uncores = snbep_msr_uncores; break; - case 46: + case 46: /* Nehalem-EX */ + uncore_nhmex = true; + case 47: /* Westmere-EX aka. Xeon E7 */ + if (!uncore_nhmex) + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > max_cores) + nhmex_uncore_cbox.num_boxes = max_cores; msr_uncores = nhmex_msr_uncores; break; default: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 8384e9b543b7..5b81c1856aac 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -276,18 +276,12 @@ NHMEX_M_PMON_CTL_INC_SEL_MASK | \ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) - -#define NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK 0x1f -#define NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK (0x7 << 5) -#define NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK (0x7 << 8) -#define NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR (1 << 23) -#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK \ - (NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR) +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) #define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n))) +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (12 + 3 * (n))) + /* * use the 9~13 bits to select event If the 7th bit is not set, * otherwise use the 19~21 bits to select event. @@ -369,6 +363,7 @@ struct intel_uncore_type { unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; + unsigned *msr_offsets; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -486,29 +481,31 @@ unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) return idx * 8 + box->pmu->type->perf_ctr; } -static inline -unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) +{ + struct intel_uncore_pmu *pmu = box->pmu; + return pmu->type->msr_offsets ? + pmu->type->msr_offsets[pmu->pmu_idx] : + pmu->type->msr_offset * pmu->pmu_idx; +} + +static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) { if (!box->pmu->type->box_ctl) return 0; - return box->pmu->type->box_ctl + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->box_ctl + uncore_msr_box_offset(box); } -static inline -unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) { if (!box->pmu->type->fixed_ctl) return 0; - return box->pmu->type->fixed_ctl + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); } -static inline -unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) { - return box->pmu->type->fixed_ctr + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); } static inline @@ -516,7 +513,7 @@ unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) { return box->pmu->type->event_ctl + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + uncore_msr_box_offset(box); } static inline @@ -524,7 +521,7 @@ unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) { return box->pmu->type->perf_ctr + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + uncore_msr_box_offset(box); } static inline -- cgit v1.2.3 From 26a4f3c08de49c1437a7b7f97693cf22d8c31656 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 9 Aug 2012 11:52:34 +0300 Subject: perf/x86: disable PEBS on a guest entry. If PMU counter has PEBS enabled it is not enough to disable counter on a guest entry since PEBS memory write can overshoot guest entry and corrupt guest memory. Disabling PEBS during guest entry solves the problem. Tested-by: David Ahern Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120809085234.GI3341@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 382366977d4c..7f2739e03e79 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1522,8 +1522,16 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + /* + * If PMU counter has PEBS enabled it is not enough to disable counter + * on a guest entry since PEBS memory write can overshoot guest entry + * and corrupt guest memory. Disabling PEBS solves the problem. + */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; - *nr = 1; + *nr = 2; return arr; } -- cgit v1.2.3 From f1c6300183dbf5b9da25988e13f6f25a9e27151b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 8 Aug 2012 12:16:52 -0700 Subject: x86, apic: fix broken legacy interrupts in the logical apic mode Recent commit 332afa656e76458ee9cf0f0d123016a0658539e4 cleaned up a workaround that updates irq_cfg domain for legacy irq's that are handled by the IO-APIC. This was assuming that the recent changes in assign_irq_vector() were sufficient to remove the workaround. But this broke couple of AMD platforms. One of them seems to be sending interrupts to the offline cpu's, resulting in spurious "No irq handler for vector xx (irq -1)" messages when those cpu's come online. And the other platform seems to always send the interrupt to the last logical CPU (cpu-7). Recent changes had an unintended side effect of using only logical cpu-0 in the IO-APIC RTE (during boot for the legacy interrupts) and this broke the legacy interrupts not getting routed to the cpu-7 on the AMD platform, resulting in a boot hang. For now, reintroduce the removed workaround, (essentially not allowing the vector to change for legacy irq's when io-apic starts to handle the irq. Which also addressed the uninteded sife effect of just specifying cpu-0 in the IO-APIC RTE for those irq's during boot). Reported-and-tested-by: Robert Richter Reported-and-tested-by: Borislav Petkov Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1344453412.29170.5.camel@sbsiddha-desk.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a6c64aaddf9a..c265593ec2cd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1356,6 +1356,16 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; + /* + * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC + * can handle this irq and the apic driver is finialized at this point, + * update the cfg->domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && + cpumask_equal(cfg->domain, cpumask_of(0))) + apic->vector_allocation_domain(0, cfg->domain, + apic->target_cpus()); + if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.2.3 From f026cfa82f628db24b8cea41b9d6202af104cecb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Aug 2012 09:53:38 -0700 Subject: Revert "x86-64/efi: Use EFI to deal with platform wall clock" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bacef661acdb634170a8faddbc1cf28e8f8b9eee. This commit has been found to cause serious regressions on a number of ASUS machines at the least. We probably need to provide a 1:1 map in addition to the EFI virtual memory map in order for this to work. Signed-off-by: H. Peter Anvin Reported-and-bisected-by: Jérôme Carretero Cc: Jan Beulich Cc: Matt Fleming Cc: Matthew Garrett Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120805172903.5f8bb24c@zougloub.eu --- arch/x86/mm/pageattr.c | 10 ++++------ arch/x86/platform/efi/efi.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 931930a96160..a718e0d23503 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -919,13 +919,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it, in the - * error case, and during early boot (for EFI) we fall back - * to cpa_flush_all (which uses wbinvd): + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): */ - if (early_boot_irqs_disabled) - __cpa_flush_all((void *)(long)cache); - else if (!ret && cpu_has_clflush) { + if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 2dc29f51e75a..92660edaa1e7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -234,7 +234,22 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -static int efi_set_rtc_mmss(unsigned long nowtime) +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), + virt_to_phys(tc)); + efi_call_phys_epilog(); + spin_unlock_irqrestore(&rtc_lock, flags); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes; efi_status_t status; @@ -263,7 +278,7 @@ static int efi_set_rtc_mmss(unsigned long nowtime) return 0; } -static unsigned long efi_get_time(void) +unsigned long efi_get_time(void) { efi_status_t status; efi_time_t eft; @@ -606,13 +621,18 @@ static int __init efi_runtime_init(void) } /* * We will only need *early* access to the following - * EFI runtime service before set_virtual_address_map + * two EFI runtime services before set_virtual_address_map * is invoked. */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) runtime->set_virtual_address_map; - + /* + * Make efi_get_time can be called before entering + * virtual mode. + */ + efi.get_time = phys_efi_get_time; early_iounmap(runtime, sizeof(efi_runtime_services_t)); return 0; @@ -700,10 +720,12 @@ void __init efi_init(void) efi_enabled = 0; return; } +#ifdef CONFIG_X86_32 if (efi_native) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } +#endif #if EFI_DEBUG print_efi_memmap(); -- cgit v1.2.3 From ca08649eb5dd30f11a5a8fe8659b48899b7ea6a1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 16 Aug 2012 11:31:27 -0400 Subject: Revert "xen PVonHVM: move shared_info to MMIO before kexec" This reverts commit 00e37bdb0113a98408de42db85be002f21dbffd3. During shutdown of PVHVM guests with more than 2VCPUs on certain machines we can hit the race where the replaced shared_info is not replaced fast enough and the PV time clock retries reading the same area over and over without any any success and is stuck in an infinite loop. Acked-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 118 +++++------------------------------------------ arch/x86/xen/suspend.c | 2 +- arch/x86/xen/xen-ops.h | 2 +- 3 files changed, 13 insertions(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a6f8acbdfc9a..f1814fc2cb77 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -1472,130 +1471,38 @@ asmlinkage void __init xen_start_kernel(void) #endif } -#ifdef CONFIG_XEN_PVHVM -/* - * The pfn containing the shared_info is located somewhere in RAM. This - * will cause trouble if the current kernel is doing a kexec boot into a - * new kernel. The new kernel (and its startup code) can not know where - * the pfn is, so it can not reserve the page. The hypervisor will - * continue to update the pfn, and as a result memory corruption occours - * in the new kernel. - * - * One way to work around this issue is to allocate a page in the - * xen-platform pci device's BAR memory range. But pci init is done very - * late and the shared_info page is already in use very early to read - * the pvclock. So moving the pfn from RAM to MMIO is racy because some - * code paths on other vcpus could access the pfn during the small - * window when the old pfn is moved to the new pfn. There is even a - * small window were the old pfn is not backed by a mfn, and during that - * time all reads return -1. - * - * Because it is not known upfront where the MMIO region is located it - * can not be used right from the start in xen_hvm_init_shared_info. - * - * To minimise trouble the move of the pfn is done shortly before kexec. - * This does not eliminate the race because all vcpus are still online - * when the syscore_ops will be called. But hopefully there is no work - * pending at this point in time. Also the syscore_op is run last which - * reduces the risk further. - */ - -static struct shared_info *xen_hvm_shared_info; - -static void xen_hvm_connect_shared_info(unsigned long pfn) +void __ref xen_hvm_init_shared_info(void) { + int cpu; struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page = 0; + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = pfn; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); -} -static void xen_hvm_set_shared_info(struct shared_info *sip) -{ - int cpu; - - HYPERVISOR_shared_info = sip; + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM. - * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_set_shared_info is run at resume time too and + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and * in that case multiple vcpus might be online. */ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -/* Reconnect the shared_info pfn to a mfn */ -void xen_hvm_resume_shared_info(void) -{ - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); -} - -#ifdef CONFIG_KEXEC -static struct shared_info *xen_hvm_shared_info_kexec; -static unsigned long xen_hvm_shared_info_pfn_kexec; - -/* Remember a pfn in MMIO space for kexec reboot */ -void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) -{ - xen_hvm_shared_info_kexec = sip; - xen_hvm_shared_info_pfn_kexec = pfn; -} - -static void xen_hvm_syscore_shutdown(void) -{ - struct xen_memory_reservation reservation = { - .domid = DOMID_SELF, - .nr_extents = 1, - }; - unsigned long prev_pfn; - int rc; - - if (!xen_hvm_shared_info_kexec) - return; - - prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; - set_xen_guest_handle(reservation.extent_start, &prev_pfn); - - /* Move pfn to MMIO, disconnects previous pfn from mfn */ - xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); - - /* Update pointers, following hypercall is also a memory barrier */ - xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); - - /* Allocate new mfn for previous pfn */ - do { - rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc == 0) - msleep(123); - } while (rc == 0); - - /* Make sure the previous pfn is really connected to a (new) mfn */ - BUG_ON(rc != 1); -} - -static struct syscore_ops xen_hvm_syscore_ops = { - .shutdown = xen_hvm_syscore_shutdown, -}; -#endif - -/* Use a pfn in RAM, may move to MMIO before kexec. */ -static void __init xen_hvm_init_shared_info(void) -{ - /* Remember pointer for resume */ - xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); - xen_hvm_set_shared_info(xen_hvm_shared_info); -} - +#ifdef CONFIG_XEN_PVHVM static void __init init_hvm_pv_info(void) { int major, minor; @@ -1646,9 +1553,6 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); xen_hvm_init_shared_info(); -#ifdef CONFIG_KEXEC - register_syscore_ops(&xen_hvm_syscore_ops); -#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index ae8a00c39de4..45329c8c226e 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_resume_shared_info(); + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e4329e04e0f..202d4c150154 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_resume_shared_info(void); +void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit v1.2.3 From 250a41e0ecc433cdd553a364d0fc74c766425209 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 09:27:35 -0400 Subject: xen/p2m: Reuse existing P2M leafs if they are filled with 1:1 PFNs or INVALID. If P2M leaf is completly packed with INVALID_P2M_ENTRY or with 1:1 PFNs (so IDENTITY_FRAME type PFNs), we can swap the P2M leaf with either a p2m_missing or p2m_identity respectively. The old page (which was created via extend_brk or was grafted on from the mfn_list) can be re-used for setting new PFNs. This also means we can remove git commit: 5bc6f9888db5739abfa0cae279b4b442e4db8049 xen/p2m: Reserve 8MB of _brk space for P2M leafs when populating back which tried to fix this. and make the amount that is required to be reserved much smaller. CC: stable@vger.kernel.org # for 3.5 only. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b2e91d40a4cb..d4b255463253 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -196,9 +196,11 @@ RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); /* When we populate back during bootup, the amount of pages can vary. The * max we have is seen is 395979, but that does not mean it can't be more. - * But some machines can have 3GB I/O holes even. So lets reserve enough - * for 4GB of I/O and E820 holes. */ -RESERVE_BRK(p2m_populated, PMD_SIZE * 4); + * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle + * it can re-use Xen provided mfn_list array, so we only need to allocate at + * most three P2M top nodes. */ +RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -575,12 +577,99 @@ static bool __init early_alloc_p2m(unsigned long pfn) } return true; } + +/* + * Skim over the P2M tree looking at pages that are either filled with + * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and + * replace the P2M leaf with a p2m_missing or p2m_identity. + * Stick the old page in the new P2M tree location. + */ +bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +{ + unsigned topidx; + unsigned mididx; + unsigned ident_pfns; + unsigned inv_pfns; + unsigned long *p2m; + unsigned long *mid_mfn_p; + unsigned idx; + unsigned long pfn; + + /* We only look when this entails a P2M middle layer */ + if (p2m_index(set_pfn)) + return false; + + for (pfn = 0; pfn <= MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + topidx = p2m_top_index(pfn); + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + p2m = p2m_top[topidx][mididx]; + if (!p2m) + continue; + + if ((p2m == p2m_missing) || (p2m == p2m_identity)) + continue; + + if ((unsigned long)p2m == INVALID_P2M_ENTRY) + continue; + + ident_pfns = 0; + inv_pfns = 0; + for (idx = 0; idx < P2M_PER_PAGE; idx++) { + /* IDENTITY_PFNs are 1:1 */ + if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) + ident_pfns++; + else if (p2m[idx] == INVALID_P2M_ENTRY) + inv_pfns++; + else + break; + } + if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) + goto found; + } + return false; +found: + /* Found one, replace old with p2m_identity or p2m_missing */ + p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); + /* And the other for save/restore.. */ + mid_mfn_p = p2m_top_mfn_p[topidx]; + /* NOTE: Even if it is a p2m_identity it should still be point to + * a page filled with INVALID_P2M_ENTRY entries. */ + mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); + + /* Reset where we want to stick the old page in. */ + topidx = p2m_top_index(set_pfn); + mididx = p2m_mid_index(set_pfn); + + /* This shouldn't happen */ + if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) + early_alloc_p2m(set_pfn); + + if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) + return false; + + p2m_init(p2m); + p2m_top[topidx][mididx] = p2m; + mid_mfn_p = p2m_top_mfn_p[topidx]; + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + return true; +} bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!early_alloc_p2m(pfn)) return false; + if (early_can_reuse_p2m_middle(pfn, mfn)) + return __set_phys_to_machine(pfn, mfn); + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) return false; -- cgit v1.2.3 From 515c7af85ed92696c311c53d53cb4898ff32d784 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sat, 18 Aug 2012 16:11:37 -0400 Subject: x32: Use compat shims for {g,s}etsockopt Some of the arguments to {g,s}etsockopt are passed in userland pointers. If we try to use the 64bit entry point, we end up sometimes failing. For example, dhcpcd doesn't run in x32: # dhcpcd eth0 dhcpcd[1979]: version 5.5.6 starting dhcpcd[1979]: eth0: broadcasting for a lease dhcpcd[1979]: eth0: open_socket: Invalid argument dhcpcd[1979]: eth0: send_raw_packet: Bad file descriptor The code in particular is getting back EINVAL when doing: struct sock_fprog pf; setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &pf, sizeof(pf)); Diving into the kernel code, we can see: include/linux/filter.h: struct sock_fprog { unsigned short len; struct sock_filter __user *filter; }; net/core/sock.c: case SO_ATTACH_FILTER: ret = -EINVAL; if (optlen == sizeof(struct sock_fprog)) { struct sock_fprog fprog; ret = -EFAULT; if (copy_from_user(&fprog, optval, sizeof(fprog))) break; ret = sk_attach_filter(&fprog, sk); } break; arch/x86/syscalls/syscall_64.tbl: 54 common setsockopt sys_setsockopt 55 common getsockopt sys_getsockopt So for x64, sizeof(sock_fprog) is 16 bytes. For x86/x32, it's 8 bytes. This comes down to the pointer being 32bit for x32, which means we need to do structure size translation. But since x32 comes in directly to sys_setsockopt, it doesn't get translated like x86. After changing the syscall table and rebuilding glibc with the new kernel headers, dhcp runs fine in an x32 userland. Oddly, it seems like Linus noted the same thing during the initial port, but I guess that was missed/lost along the way: https://lkml.org/lkml/2011/8/26/452 [ hpa: tagging for -stable since this is an ABI fix. ] Bugzilla: https://bugs.gentoo.org/423649 Reported-by: Mads Signed-off-by: Mike Frysinger Link: http://lkml.kernel.org/r/1345320697-15713-1-git-send-email-vapier@gentoo.org Cc: H. J. Lu Cc: v3.4..v3.5 Signed-off-by: H. Peter Anvin --- arch/x86/syscalls/syscall_64.tbl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 29aed7ac2c02..a582bfed95bb 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -60,8 +60,8 @@ 51 common getsockname sys_getsockname 52 common getpeername sys_getpeername 53 common socketpair sys_socketpair -54 common setsockopt sys_setsockopt -55 common getsockopt sys_getsockopt +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt 56 common clone stub_clone 57 common fork stub_fork 58 common vfork stub_vfork @@ -353,3 +353,5 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt -- cgit v1.2.3 From eb48c071464757414538c68a6033c8f8c15196f8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2012 16:15:52 -0700 Subject: mm: hugetlbfs: correctly populate shared pmd Each page mapped in a process's address space must be correctly accounted for in _mapcount. Normally the rules for this are straightforward but hugetlbfs page table sharing is different. The page table pages at the PMD level are reference counted while the mapcount remains the same. If this accounting is wrong, it causes bugs like this one reported by Larry Woodman: kernel BUG at mm/filemap.c:135! invalid opcode: 0000 [#1] SMP CPU 22 Modules linked in: bridge stp llc sunrpc binfmt_misc dcdbas microcode pcspkr acpi_pad acpi] Pid: 18001, comm: mpitest Tainted: G W 3.3.0+ #4 Dell Inc. PowerEdge R620/07NDJ2 RIP: 0010:[] [] __delete_from_page_cache+0x15d/0x170 Process mpitest (pid: 18001, threadinfo ffff880428972000, task ffff880428b5cc20) Call Trace: delete_from_page_cache+0x40/0x80 truncate_hugepages+0x115/0x1f0 hugetlbfs_evict_inode+0x18/0x30 evict+0x9f/0x1b0 iput_final+0xe3/0x1e0 iput+0x3e/0x50 d_kill+0xf8/0x110 dput+0xe2/0x1b0 __fput+0x162/0x240 During fork(), copy_hugetlb_page_range() detects if huge_pte_alloc() shared page tables with the check dst_pte == src_pte. The logic is if the PMD page is the same, they must be shared. This assumes that the sharing is between the parent and child. However, if the sharing is with a different process entirely then this check fails as in this diagram: parent | ------------>pmd src_pte----------> data page ^ other--------->pmd--------------------| ^ child-----------| dst_pte For this situation to occur, it must be possible for Parent and Other to have faulted and failed to share page tables with each other. This is possible due to the following style of race. PROC A PROC B copy_hugetlb_page_range copy_hugetlb_page_range src_pte == huge_pte_offset src_pte == huge_pte_offset !src_pte so no sharing !src_pte so no sharing (time passes) hugetlb_fault hugetlb_fault huge_pte_alloc huge_pte_alloc huge_pmd_share huge_pmd_share LOCK(i_mmap_mutex) find nothing, no sharing UNLOCK(i_mmap_mutex) LOCK(i_mmap_mutex) find nothing, no sharing UNLOCK(i_mmap_mutex) pmd_alloc pmd_alloc LOCK(instantiation_mutex) fault UNLOCK(instantiation_mutex) LOCK(instantiation_mutex) fault UNLOCK(instantiation_mutex) These two processes are not poing to the same data page but are not sharing page tables because the opportunity was missed. When either process later forks, the src_pte == dst pte is potentially insufficient. As the check falls through, the wrong PTE information is copied in (harmless but wrong) and the mapcount is bumped for a page mapped by a shared page table leading to the BUG_ON. This patch addresses the issue by moving pmd_alloc into huge_pmd_share which guarantees that the shared pud is populated in the same critical section as pmd. This also means that huge_pte_offset test in huge_pmd_share is serialized correctly now which in turn means that the success of the sharing will be higher as the racing tasks see the pud and pmd populated together. Race identified and changelog written mostly by Mel Gorman. {akpm@linux-foundation.org: attempt to make the huge_pmd_share() comment comprehensible, clean up coding style] Reported-by: Larry Woodman Tested-by: Larry Woodman Reviewed-by: Mel Gorman Signed-off-by: Michal Hocko Reviewed-by: Rik van Riel Cc: David Gibson Cc: Ken Chen Cc: Cong Wang Cc: Hillf Danton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f6679a7fb8ca..b91e48512425 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) } /* - * search for a shareable pmd page for hugetlb. + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_mutex section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +static pte_t * +huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; @@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; + pte_t *pte; if (!vma_shareable(vma, addr)) - return; + return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { @@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: + pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); + return pte; } /* @@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else { BUG_ON(sz != PMD_SIZE); if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); } } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); -- cgit v1.2.3 From 83be4ffa1acbcd529b771f4d2e639b15e2b7957e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 14 Aug 2012 14:47:37 -0700 Subject: x86/spinlocks: Fix comment in spinlock.h This comment is no longer true. We support up to 2^16 CPUs because __ticket_t is an u16 if NR_CPUS is larger than 256. Signed-off-by: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b315a33867f2..33692eaabab5 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -12,8 +12,7 @@ * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. * - * These are fair FIFO ticket locks, which are currently limited to 256 - * CPUs. + * These are fair FIFO ticket locks, which support up to 2^16 CPUs. * * (the type definitions are in asm/spinlock_types.h) */ -- cgit v1.2.3 From 2530cd4f448935c74eeb49f29559589928e4b2f0 Mon Sep 17 00:00:00 2001 From: "Liu, Chuansheng" Date: Tue, 14 Aug 2012 06:55:01 +0000 Subject: x86/fixup_irq: Use cpu_online_mask instead of cpu_all_mask When one CPU is going down and this CPU is the last one in irq affinity, current code is setting cpu_all_mask as the new affinity for that irq. But for some systems (such as in Medfield Android mobile) the firmware sends the interrupt to each CPU in the irq affinity mask, averaged, and cpu_all_mask includes all potential CPUs, i.e. offline ones as well. So replace cpu_all_mask with cpu_online_mask. Signed-off-by: liu chuansheng Acked-by: Yanmin Zhang Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/27240C0AC20F114CBF8149A2696CBE4A137286@SHSMSX101.ccr.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7ad683d78645..d44f7829968e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -270,7 +270,7 @@ void fixup_irqs(void) if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - affinity = cpu_all_mask; + affinity = cpu_online_mask; } chip = irq_data_get_irq_chip(data); -- cgit v1.2.3 From cb09cad44f07044d9810f18f6f9a6a6f3771f979 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 22 Aug 2012 13:03:48 +0300 Subject: x86/alternatives: Fix p6 nops on non-modular kernels Probably a leftover from the early days of self-patching, p6nops are marked __initconst_or_module, which causes them to be discarded in a non-modular kernel. If something later triggers patching, it will overwrite kernel code with garbage. Reported-by: Tomas Racek Signed-off-by: Avi Kivity Cc: Michael Tokarev Cc: Borislav Petkov Cc: Marcelo Tosatti Cc: qemu-devel@nongnu.org Cc: Anthony Liguori Cc: H. Peter Anvin Cc: Alan Cox Cc: Alan Cox Link: http://lkml.kernel.org/r/5034AE84.90708@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index afb7ff79a29f..ced4534baed5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -165,7 +165,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = #endif #ifdef P6_NOP1 -static const unsigned char __initconst_or_module p6nops[] = +static const unsigned char p6nops[] = { P6_NOP1, P6_NOP2, -- cgit v1.2.3 From 35f2d16bb9ace0fb2671b8232839944ad9057c6f Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 20 Aug 2012 18:35:39 +0900 Subject: KVM: MMU: Fix mmu_shrink() so that it can free mmu pages as intended Although the possible race described in commit 85b7059169e128c57a3a8a3e588fb89cb2031da1 KVM: MMU: fix shrinking page from the empty mmu was correct, the real cause of that issue was a more trivial bug of mmu_shrink() introduced by commit 1952639665e92481c34c34c3e2a71bf3e66ba362 KVM: MMU: do not iterate over all VMs in mmu_shrink() Here is the bug: if (kvm->arch.n_used_mmu_pages > 0) { if (!nr_to_scan--) break; continue; } We skip VMs whose n_used_mmu_pages is not zero and try to shrink others: in other words we try to shrink empty ones by mistake. This patch reverses the logic so that mmu_shrink() can free pages from the first VM whose n_used_mmu_pages is not zero. Note that we also add comments explaining the role of nr_to_scan which is not practically important now, hoping this will be improved in the future. Signed-off-by: Takuya Yoshikawa Cc: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 01ca00423938..7fbd0d273ea8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4112,17 +4112,22 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) int idx; LIST_HEAD(invalid_list); + /* + * Never scan more than sc->nr_to_scan VM instances. + * Will not hit this condition practically since we do not try + * to shrink more than one VM and it is very unlikely to see + * !n_used_mmu_pages so many times. + */ + if (!nr_to_scan--) + break; /* * n_used_mmu_pages is accessed without holding kvm->mmu_lock * here. We may skip a VM instance errorneosly, but we do not * want to shrink a VM that only started to populate its MMU * anyway. */ - if (kvm->arch.n_used_mmu_pages > 0) { - if (!nr_to_scan--) - break; + if (!kvm->arch.n_used_mmu_pages) continue; - } idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); -- cgit v1.2.3 From 5ad105e569c45dcfad50d724c61d5061248be755 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 19 Aug 2012 14:34:31 +0300 Subject: KVM: x86 emulator: use stack size attribute to mask rsp in stack ops The sub-register used to access the stack (sp, esp, or rsp) is not determined by the address size attribute like other memory references, but by the stack segment's B bit (if not in x86_64 mode). Fix by using the existing stack_mask() to figure out the correct mask. This long-existing bug was exposed by a combination of a27685c33acccce (emulate invalid guest state by default), which causes many more instructions to be emulated, and a seabios change (possibly a bug) which causes the high 16 bits of esp to become polluted across calls to real mode software interrupts. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97d9a9914ba8..a3b57a27be88 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) return address_mask(ctxt, reg); } +static void masked_increment(ulong *reg, ulong mask, int inc) +{ + assign_masked(reg, *reg + inc, mask); +} + static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) { + ulong mask; + if (ctxt->ad_bytes == sizeof(unsigned long)) - *reg += inc; + mask = ~0UL; else - *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); + mask = ad_mask(ctxt); + masked_increment(reg, mask, inc); +} + +static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) +{ + masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); } static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) { struct segmented_address addr; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + rsp_increment(ctxt, -bytes); + addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; return segmented_write(ctxt, addr, data, bytes); @@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, int rc; struct segmented_address addr; - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); + rsp_increment(ctxt, len); return rc; } @@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], - ctxt->op_bytes); + rsp_increment(ctxt, ctxt->op_bytes); --reg; } @@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); + rsp_increment(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } -- cgit v1.2.3 From 36bf50d7697be18c6bfd0401e037df10bff1e573 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 31 Jul 2012 15:41:45 +0200 Subject: x86, microcode, AMD: Fix broken ucode patch size check This issue was recently observed on an AMD C-50 CPU where a patch of maximum size was applied. Commit be62adb49294 ("x86, microcode, AMD: Simplify ucode verification") added current_size in get_matching_microcode(). This is calculated as size of the ucode patch + 8 (ie. size of the header). Later this is compared against the maximum possible ucode patch size for a CPU family. And of course this fails if the patch has already maximum size. Cc: [3.3+] Signed-off-by: Andreas Herrmann Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-1-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_amd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 8a2ce8fd41c0..82746f942cd8 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -143,11 +143,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, unsigned int *current_size) { struct microcode_header_amd *mc_hdr; - unsigned int actual_size; + unsigned int actual_size, patch_size; u16 equiv_cpu_id; /* size of the current patch we're staring at */ - *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; + patch_size = *(u32 *)(ucode_ptr + 4); + *current_size = patch_size + SECTION_HDR_SIZE; equiv_cpu_id = find_equiv_id(); if (!equiv_cpu_id) @@ -174,7 +175,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, /* * now that the header looks sane, verify its size */ - actual_size = verify_ucode_size(cpu, *current_size, leftover_size); + actual_size = verify_ucode_size(cpu, patch_size, leftover_size); if (!actual_size) return 0; -- cgit v1.2.3 From c96aae1f7f393387d160211f60398d58463a7e65 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 16:43:28 -0400 Subject: xen/setup: Fix one-off error when adding for-balloon PFNs to the P2M. When we are finished with return PFNs to the hypervisor, then populate it back, and also mark the E820 MMIO and E820 gaps as IDENTITY_FRAMEs, we then call P2M to set areas that can be used for ballooning. We were off by one, and ended up over-writting a P2M entry that most likely was an IDENTITY_FRAME. For example: 1-1 mapping on 40000->40200 1-1 mapping on bc558->bc5ac 1-1 mapping on bc5b4->bc8c5 1-1 mapping on bc8c6->bcb7c 1-1 mapping on bcd00->100000 Released 614 pages of unused memory Set 277889 page(s) to 1-1 mapping Populating 40200-40466 pfn range: 614 pages added => here we set from 40466 up to bc559 P2M tree to be INVALID_P2M_ENTRY. We should have done it up to bc558. The end result is that if anybody is trying to construct a PTE for PFN bc558 they end up with ~PAGE_PRESENT. CC: stable@vger.kernel.org Reported-by-and-Tested-by: Andre Przywara Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ead85576d54a..d11ca11d14fc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); xen_max_p2m_pfn = PFN_DOWN(start + size); + for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + continue; + WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); - for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } } static unsigned long __init xen_do_chunk(unsigned long start, -- cgit v1.2.3 From e3e45c01ae690e65f2650e5288b9af802e95a136 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 24 Aug 2012 15:34:34 +0200 Subject: perf/x86: Fix microcode revision check for SNB-PEBS The following patch makes the microcode update code path actually invoke the perf_check_microcode() function and thus potentially renabling SNB PEBS. By default, CONFIG_MICROCODE_OLD_INTERFACE is forced to Y in arch/x86/Kconfig. There is no way to disable this. That means that the code path used in arch/x86/kernel/microcode_core.c did not include the call to perf_check_microcode(). Thus, even though the microcode was updated to a version that fixes the SNB PEBS problem, perf_event would still return EOPNOTSUPP when enabling precise sampling. This patch simply adds a call to perf_check_microcode() in the call path used when OLD_INTERFACE=y. Signed-off-by: Stephane Eranian Acked-by: Borislav Petkov Cc: peterz@infradead.org Cc: andi@firstfloor.org Link: http://lkml.kernel.org/r/20120824133434.GA8014@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 4873e62db6a1..9e5bcf1e2376 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -225,6 +225,9 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; + if (ret > 0) + perf_check_microcode(); + mutex_unlock(µcode_mutex); put_online_cpus(); -- cgit v1.2.3 From 1d92128fe9e30c2340283361957a840f108e4abf Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Aug 2012 18:00:29 +0300 Subject: KVM: x86: fix KVM_GET_MSR for PV EOI KVM_GET_MSR was missing support for PV EOI, which is needed for migration. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dce75b760312..148ed666e311 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2000,6 +2000,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_STEAL_TIME: data = vcpu->arch.st.msr_val; break; + case MSR_KVM_PV_EOI_EN: + data = vcpu->arch.pv_eoi.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: -- cgit v1.2.3 From 749c59fd15b2c18dd6c15c353a899fb6ac49b865 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Thu, 30 Aug 2012 11:32:13 +0100 Subject: KVM: PIC: fix use of uninitialised variable. Commit aea218f3cbbc (KVM: PIC: call ack notifiers for irqs that are dropped form irr) used an uninitialised variable to track whether an appropriate apic had been found. This could result in calling the ack notifier incorrectly. Cc: Gleb Natapov Cc: Avi Kivity Signed-off-by: Jamie Iles Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e498b18f010c..9fc9aa7ac703 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -318,7 +318,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) if (val & 0x10) { u8 edge_irr = s->irr & ~s->elcr; int i; - bool found; + bool found = false; struct kvm_vcpu *vcpu; s->init4 = val & 1; -- cgit v1.2.3 From 3ec18cd8b8f8395d0df604c62ab3bc2cf3a966b4 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 20 Aug 2012 11:24:21 +0200 Subject: perf/x86: Enable Intel Cedarview Atom suppport This patch enables perf_events support for Intel Cedarview Atom (model 54) processors. Support includes PEBS and LBR. Tested on my Atom N2600 netbook. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120820092421.GA11284@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7f2739e03e79..0d3d63afa76a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2008,6 +2008,7 @@ __init int intel_pmu_init(void) break; case 28: /* Atom */ + case 54: /* Cedariew */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 520b4265fcd2..da02e9cc3754 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -686,7 +686,8 @@ void intel_pmu_lbr_init_atom(void) * to have an operational LBR which can freeze * on PMU interrupt */ - if (boot_cpu_data.x86_mask < 10) { + if (boot_cpu_data.x86_model == 28 + && boot_cpu_data.x86_mask < 10) { pr_cont("LBR disabled due to erratum"); return; } -- cgit v1.2.3 From 50e900417b8096939d12a46848f965e27a905e36 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 4 Sep 2012 15:45:17 -0400 Subject: xen/p2m: Fix one-off error in checking the P2M tree directory. We would traverse the full P2M top directory (from 0->MAX_DOMAIN_PAGES inclusive) when trying to figure out whether we can re-use some of the P2M middle leafs. Which meant that if the kernel was compiled with MAX_DOMAIN_PAGES=512 we would try to use the 512th entry. Fortunately for us the p2m_top_index has a check for this: BUG_ON(pfn >= MAX_P2M_PFN); which we hit and saw this: (XEN) domain_crash_sync called from entry.S (XEN) Domain 0 (vcpu#0) crashed on cpu#0: (XEN) ----[ Xen-4.1.2-OVM x86_64 debug=n Tainted: C ]---- (XEN) CPU: 0 (XEN) RIP: e033:[] (XEN) RFLAGS: 0000000000000212 EM: 1 CONTEXT: pv guest (XEN) rax: ffffffff81db5000 rbx: ffffffff81db4000 rcx: 0000000000000000 (XEN) rdx: 0000000000480211 rsi: 0000000000000000 rdi: ffffffff81db4000 (XEN) rbp: ffffffff81793db8 rsp: ffffffff81793d38 r8: 0000000008000000 (XEN) r9: 4000000000000000 r10: 0000000000000000 r11: ffffffff81db7000 (XEN) r12: 0000000000000ff8 r13: ffffffff81df1ff8 r14: ffffffff81db6000 (XEN) r15: 0000000000000ff8 cr0: 000000008005003b cr4: 00000000000026f0 (XEN) cr3: 0000000661795000 cr2: 0000000000000000 Fixes-Oracle-Bug: 14570662 CC: stable@vger.kernel.org # only for v3.5 Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index d4b255463253..76ba0e97e530 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -599,7 +599,7 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_ if (p2m_index(set_pfn)) return false; - for (pfn = 0; pfn <= MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { topidx = p2m_top_index(pfn); if (!p2m_top[topidx]) -- cgit v1.2.3 From ce7184bdbd38d920fb515266fbbdc585ad2e5493 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 24 Aug 2012 08:55:13 +0000 Subject: xen: fix logical error in tlb flushing While TLB_FLUSH_ALL gets passed as 'end' argument to flush_tlb_others(), the Xen code was made to check its 'start' parameter. That may give a incorrect op.cmd to MMUEXT_INVLPG_MULTI instead of MMUEXT_TLB_FLUSH_MULTI. Then it causes some page can not be flushed from TLB. This patch fixed this issue. Reported-by: Jan Beulich Signed-off-by: Alex Shi Acked-by: Jan Beulich Tested-by: Yongjie Ren Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b65a76133f4f..5141d808e751 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1283,7 +1283,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; args->op.arg1.linear_addr = start; } -- cgit v1.2.3 From 4f97704555672f9ab48ca623561e96a9430bec9a Mon Sep 17 00:00:00 2001 From: "Ren, Yongjie" Date: Fri, 7 Sep 2012 07:36:59 +0000 Subject: KVM: x86: Check INVPCID feature bit in EBX of leaf 7 Checks and operations on the INVPCID feature bit should use EBX of CPUID leaf 7 instead of ECX. Signed-off-by: Junjie Mao Signed-off-by: Yongjie Ren Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c00f03de1b79..002b4a566e2d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6575,7 +6575,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ecx & bit(X86_FEATURE_INVPCID)) && + best && (best->ebx & bit(X86_FEATURE_INVPCID)) && guest_cpuid_has_pcid(vcpu)) { exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, @@ -6585,7 +6585,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); if (best) - best->ecx &= ~bit(X86_FEATURE_INVPCID); + best->ebx &= ~bit(X86_FEATURE_INVPCID); } } -- cgit v1.2.3 From 4484141a94f4a5afea6ebc0b2abba0aa1b0ae9d1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Sep 2012 14:14:20 +0800 Subject: KVM: fix error paths for failed gfn_to_page() calls This bug was triggered: [ 4220.198458] BUG: unable to handle kernel paging request at fffffffffffffffe [ 4220.203907] IP: [] put_page+0xf/0x34 ...... [ 4220.237326] Call Trace: [ 4220.237361] [] kvm_arch_destroy_vm+0xf9/0x101 [kvm] [ 4220.237382] [] kvm_put_kvm+0xcc/0x127 [kvm] [ 4220.237401] [] kvm_vcpu_release+0x18/0x1c [kvm] [ 4220.237407] [] __fput+0x111/0x1ed [ 4220.237411] [] ____fput+0xe/0x10 [ 4220.237418] [] task_work_run+0x5d/0x88 [ 4220.237424] [] do_exit+0x2bf/0x7ca The test case: printf(fmt, ##args); \ exit(-1);} while (0) static int create_vm(void) { int sys_fd, vm_fd; sys_fd = open("/dev/kvm", O_RDWR); if (sys_fd < 0) die("open /dev/kvm fail.\n"); vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0); if (vm_fd < 0) die("KVM_CREATE_VM fail.\n"); return vm_fd; } static int create_vcpu(int vm_fd) { int vcpu_fd; vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); if (vcpu_fd < 0) die("KVM_CREATE_VCPU ioctl.\n"); printf("Create vcpu.\n"); return vcpu_fd; } static void *vcpu_thread(void *arg) { int vm_fd = (int)(long)arg; create_vcpu(vm_fd); return NULL; } int main(int argc, char *argv[]) { pthread_t thread; int vm_fd; (void)argc; (void)argv; vm_fd = create_vm(); pthread_create(&thread, NULL, vcpu_thread, (void *)(long)vm_fd); printf("Exit.\n"); return 0; } It caused by release kvm->arch.ept_identity_map_addr which is the error page. The parent thread can send KILL signal to the vcpu thread when it was exiting which stops faulting pages and potentially allocating memory. So gfn_to_pfn/gfn_to_page may fail at this time Fixed by checking the page before it is used Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 19 ++++++++++++++++--- arch/x86/kvm/x86.c | 13 ++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 002b4a566e2d..b1eb202ee76a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3619,6 +3619,7 @@ static void seg_setup(int seg) static int alloc_apic_access_page(struct kvm *kvm) { + struct page *page; struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; @@ -3633,7 +3634,13 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, 0xfee00); + if (is_error_page(page)) { + r = -EFAULT; + goto out; + } + + kvm->arch.apic_access_page = page; out: mutex_unlock(&kvm->slots_lock); return r; @@ -3641,6 +3648,7 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { + struct page *page; struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; @@ -3656,8 +3664,13 @@ static int alloc_identity_pagetable(struct kvm *kvm) if (r) goto out; - kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); + page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); + if (is_error_page(page)) { + r = -EFAULT; + goto out; + } + + kvm->arch.ept_identity_pagetable = page; out: mutex_unlock(&kvm->slots_lock); return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 148ed666e311..2966c847d489 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5113,17 +5113,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) !kvm_event_needs_reinjection(vcpu); } -static void vapic_enter(struct kvm_vcpu *vcpu) +static int vapic_enter(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct page *page; if (!apic || !apic->vapic_addr) - return; + return 0; page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); + if (is_error_page(page)) + return -EFAULT; vcpu->arch.apic->vapic_page = page; + return 0; } static void vapic_exit(struct kvm_vcpu *vcpu) @@ -5430,7 +5433,11 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - vapic_enter(vcpu); + r = vapic_enter(vcpu); + if (r) { + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + return r; + } r = 1; while (r > 0) { -- cgit v1.2.3