diff options
Diffstat (limited to 'arch/x86')
113 files changed, 1072 insertions, 663 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bada636d1065..b9c546a305a4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2732,10 +2732,6 @@ config COMPAT_FOR_U64_ALIGNMENT config SYSVIPC_COMPAT def_bool y depends on SYSVIPC - -config KEYS_COMPAT - def_bool y - depends on KEYS endif endmenu diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index e5612f3e3b57..d7ac721a8a96 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -16,7 +16,7 @@ #ifndef BOOT_BOOT_H #define BOOT_BOOT_H -#define STACK_SIZE 512 /* Minimum number of bytes for stack */ +#define STACK_SIZE 1024 /* Minimum number of bytes for stack */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 34d9e15857c3..4669b3a931ed 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -94,7 +94,7 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o quiet_cmd_check_data_rel = DATAREL $@ define cmd_check_data_rel for obj in $(filter %.o,$^); do \ - readelf -S $$obj | grep -qF .rel.local && { \ + ${CROSS_COMPILE}readelf -S $$obj | grep -qF .rel.local && { \ echo "error: $$obj has data relocations!" >&2; \ exit 1; \ } || true; \ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index a66854d99ee1..6de58f1bd7ec 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -430,9 +430,6 @@ void choose_random_location(unsigned long input, { unsigned long random_addr, min_addr; - /* By default, keep output position unchanged. */ - *virt_addr = *output; - if (cmdline_find_option_bool("nokaslr")) { warn("KASLR disabled: 'nokaslr' on cmdline."); return; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b3c5a5f030ce..c945acd8fa33 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -338,7 +338,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned long output_len) { const unsigned long kernel_total_size = VO__end - VO__text; - unsigned long virt_addr = (unsigned long)output; + unsigned long virt_addr = LOAD_PHYSICAL_ADDR; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -397,7 +397,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #ifndef CONFIG_RELOCATABLE if ((unsigned long)output != LOAD_PHYSICAL_ADDR) error("Destination address does not match LOAD_PHYSICAL_ADDR"); - if ((unsigned long)output != virt_addr) + if (virt_addr != LOAD_PHYSICAL_ADDR) error("Destination virtual address changed when not relocatable"); #endif diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 1c8355eadbd1..766a5211f827 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -81,8 +81,6 @@ static inline void choose_random_location(unsigned long input, unsigned long output_size, unsigned long *virt_addr) { - /* No change from existing output location. */ - *virt_addr = *output; } #endif diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index cc3bd583dce1..9e240fcba784 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include "ctype.h" +#include "string.h" int memcmp(const void *s1, const void *s2, size_t len) { diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 725e820602b1..113588ddb43f 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -18,4 +18,13 @@ int memcmp(const void *s1, const void *s2, size_t len); #define memset(d,c,l) __builtin_memset(d,c,l) #define memcmp __builtin_memcmp +extern int strcmp(const char *str1, const char *str2); +extern int strncmp(const char *cs, const char *ct, size_t count); +extern size_t strlen(const char *s); +extern char *strstr(const char *s1, const char *s2); +extern size_t strnlen(const char *s, size_t maxlen); +extern unsigned int atou(const char *s); +extern unsigned long long simple_strtoull(const char *cp, char **endp, + unsigned int base); + #endif /* BOOT_STRING_H */ diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S index 96df6a39d7e2..a2ae6891e1fc 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S @@ -157,8 +157,8 @@ LABEL skip_ %I .endr # Find min length - vmovdqa _lens+0*16(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens+0*16(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} @@ -178,8 +178,8 @@ LABEL skip_ %I vpsubd %xmm2, %xmm0, %xmm0 vpsubd %xmm2, %xmm1, %xmm1 - vmovdqa %xmm0, _lens+0*16(state) - vmovdqa %xmm1, _lens+1*16(state) + vmovdqu %xmm0, _lens+0*16(state) + vmovdqu %xmm1, _lens+1*16(state) # "state" and "args" are the same address, arg1 # len is arg2 @@ -235,8 +235,8 @@ ENTRY(sha1_mb_mgr_get_comp_job_avx2) jc .return_null # Find min length - vmovdqa _lens(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1cd792db15ef..1eab79c9ac48 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -117,11 +117,10 @@ .set T1, REG_T1 .endm -#define K_BASE %r8 #define HASH_PTR %r9 +#define BLOCKS_CTR %r8 #define BUFFER_PTR %r10 #define BUFFER_PTR2 %r13 -#define BUFFER_END %r11 #define PRECALC_BUF %r14 #define WK_BUF %r15 @@ -205,14 +204,14 @@ * blended AVX2 and ALU instruction scheduling * 1 vector iteration per 8 rounds */ - vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + vmovdqu (i * 2)(BUFFER_PTR), W_TMP .elseif ((i & 7) == 1) - vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ WY_TMP, WY_TMP .elseif ((i & 7) == 2) vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY .elseif ((i & 7) == 4) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP .elseif ((i & 7) == 7) vmovdqu WY_TMP, PRECALC_WK(i&~7) @@ -255,7 +254,7 @@ vpxor WY, WY_TMP, WY_TMP .elseif ((i & 7) == 7) vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -291,7 +290,7 @@ vpsrld $30, WY, WY vpor WY, WY_TMP, WY .elseif ((i & 7) == 7) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -446,6 +445,16 @@ .endm +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? %4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + /* * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining */ @@ -463,13 +472,16 @@ lea (2*4*80+32)(%rsp), WK_BUF # Precalc WK for first 2 blocks - PRECALC_OFFSET = 0 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 .set i, 0 .rept 160 PRECALC i .set i, i + 1 .endr - PRECALC_OFFSET = 128 + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 xchg WK_BUF, PRECALC_BUF .align 32 @@ -479,8 +491,8 @@ _loop: * we use K_BASE value as a signal of a last block, * it is set below by: cmovae BUFFER_PTR, K_BASE */ - cmp K_BASE, BUFFER_PTR - jne _begin + test BLOCKS_CTR, BLOCKS_CTR + jnz _begin .align 32 jmp _end .align 32 @@ -512,10 +524,10 @@ _loop0: .set j, j+2 .endr - add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ - cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ - + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 /* * rounds * 60,62,64,66,68 @@ -532,8 +544,8 @@ _loop0: UPDATE_HASH 12(HASH_PTR), D UPDATE_HASH 16(HASH_PTR), E - cmp K_BASE, BUFFER_PTR /* is current block the last one? */ - je _loop + test BLOCKS_CTR, BLOCKS_CTR + jz _loop mov TB, B @@ -575,10 +587,10 @@ _loop2: .set j, j+2 .endr - add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ - - cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 jmp _loop3 _loop3: @@ -641,19 +653,12 @@ _loop3: avx2_zeroupper - lea K_XMM_AR(%rip), K_BASE - + /* Setup initial values */ mov CTX, HASH_PTR mov BUF, BUFFER_PTR - lea 64(BUF), BUFFER_PTR2 - - shl $6, CNT /* mul by 64 */ - add BUF, CNT - add $64, CNT - mov CNT, BUFFER_END - cmp BUFFER_END, BUFFER_PTR2 - cmovae K_BASE, BUFFER_PTR2 + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index a78a0694ddef..ec9bee661d50 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -155,8 +155,8 @@ LABEL skip_ %I .endr # Find min length - vmovdqa _lens+0*16(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens+0*16(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} @@ -176,8 +176,8 @@ LABEL skip_ %I vpsubd %xmm2, %xmm0, %xmm0 vpsubd %xmm2, %xmm1, %xmm1 - vmovdqa %xmm0, _lens+0*16(state) - vmovdqa %xmm1, _lens+1*16(state) + vmovdqu %xmm0, _lens+0*16(state) + vmovdqu %xmm1, _lens+1*16(state) # "state" and "args" are the same address, arg1 # len is arg2 @@ -234,8 +234,8 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) jc .return_null # Find min length - vmovdqa _lens(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ef766a358b37..e7b0e7ff4c58 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1215,6 +1215,8 @@ ENTRY(nmi) * other IST entries. */ + ASM_CLAC + /* Use %rdx as our temp variable throughout */ pushq %rdx diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 7853b53959cd..3f9d1a83891a 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -30,8 +30,10 @@ static int __init vdso32_setup(char *s) { vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso32_enabled > 1) + if (vdso32_enabled > 1) { pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + vdso32_enabled = 0; + } return 1; } @@ -62,13 +64,18 @@ subsys_initcall(sysenter_setup); /* Register vsyscall32 into the ABI table */ #include <linux/sysctl.h> +static const int zero; +static const int one = 1; + static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&zero, + .extra2 = (int *)&one, }, {} }; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7fe88bb57e36..9604b2574d6c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -505,6 +505,10 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + + /* There's no sense in having PEBS for non sampling events: */ + if (!is_sampling_event(event)) + return -EINVAL; } /* * check that PEBS LBR correction does not conflict with @@ -2096,8 +2100,8 @@ static int x86_pmu_event_init(struct perf_event *event) static void refresh_pce(void *ignored) { - if (current->mm) - load_mm_cr4(current->mm); + if (current->active_mm) + load_mm_cr4(current->active_mm); } static void x86_pmu_event_mapped(struct perf_event *event) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cb8522290e6a..f0f197f459b5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -431,11 +431,11 @@ static __initconst const u64 skl_hw_cache_event_ids [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + [ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + [ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x0, @@ -3164,13 +3164,16 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct cpu_hw_events *sibling; struct intel_excl_cntrs *c; - c = per_cpu(cpu_hw_events, i).excl_cntrs; + sibling = &per_cpu(cpu_hw_events, i); + c = sibling->excl_cntrs; if (c && c->core_id == core_id) { cpuc->kfree_on_online[1] = cpuc->excl_cntrs; cpuc->excl_cntrs = c; - cpuc->excl_thread_id = 1; + if (!sibling->excl_thread_id) + cpuc->excl_thread_id = 1; break; } } @@ -3975,7 +3978,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index fec8a461bdef..1076c9a77292 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu cstate_pkg_pmu = { @@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static const struct cstate_model nhm_cstates __initconst = { diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 81b321ace8e0..f924629836a8 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -507,6 +507,9 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].to = msr_lastbranch.to; cpuc->lbr_entries[i].mispred = 0; cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].in_tx = 0; + cpuc->lbr_entries[i].abort = 0; + cpuc->lbr_entries[i].cycles = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c5047b8f777b..df60b58691e7 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -106,18 +106,24 @@ static struct attribute_group pt_cap_group = { }; PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(pwr_evt, "config:4" ); +PMU_FORMAT_ATTR(fup_on_ptw, "config:5" ); PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(ptw, "config:12" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { &format_attr_cyc.attr, + &format_attr_pwr_evt.attr, + &format_attr_fup_on_ptw.attr, &format_attr_mtc.attr, &format_attr_tsc.attr, &format_attr_noretcomp.attr, + &format_attr_ptw.attr, &format_attr_mtc_period.attr, &format_attr_cyc_thresh.attr, &format_attr_psb_period.attr, diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 0a535cea8ff3..4c1b7ea18541 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -161,7 +161,13 @@ static u64 rapl_timer_ms; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - return rapl_pmus->pmus[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -402,6 +408,8 @@ static int rapl_pmu_event_init(struct perf_event *event) /* must be done before validate_group */ pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -585,6 +593,19 @@ static int rapl_cpu_online(unsigned int cpu) struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; + if (!pmu) { + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -ENOMEM; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + } /* * Check if there is an online cpu in the package which collects rapl * events already. @@ -598,27 +619,6 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_cpu_prepare(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - - if (pmu) - return 0; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - pmu->cpu = -1; - rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; - return 0; -} - static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -697,6 +697,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; return 0; } @@ -759,7 +760,7 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), @@ -803,28 +804,21 @@ static int __init rapl_pmu_init(void) * Install callbacks. Core will call them for each online cpu. */ - ret = cpuhp_setup_state(CPUHP_PERF_X86_RAPL_PREP, "PERF_X86_RAPL_PREP", - rapl_cpu_prepare, NULL); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, "AP_PERF_X86_RAPL_ONLINE", rapl_cpu_online, rapl_cpu_offline); if (ret) - goto out1; + goto out; ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out2; + goto out1; rapl_advertise(); return 0; -out2: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out1: - cpuhp_remove_state(CPUHP_PERF_X86_RAPL_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); @@ -835,7 +829,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_RAPL_PREP); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 19d646a783fd..aec6cc925af8 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -733,6 +733,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .start = uncore_pmu_event_start, .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, + .module = THIS_MODULE, }; } else { pmu->pmu = *pmu->type->pmu; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 272427700d48..afe8024e9e95 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = boot_cpu_data.logical_proc_id; if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e7636bac7372..6c98821fef5e 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -62,8 +62,10 @@ #define new_len2 145f-144f /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 1b020381ab38..d4aea31eec03 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end) alt_end_marker ":\n" /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax * - * The additional "-" is needed because gas works with s32s. + * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" /* * Pad the second replacement alternative with additional NOPs if it is diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index e7f155c3045e..7bcd138c3aa9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ @@ -245,12 +246,13 @@ extern int force_personality32; #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) +/* + * This is the base location for PIE (ET_DYN with INTERP) loads. On + * 64-bit, this is above 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, @@ -278,7 +280,7 @@ struct task_struct; #define ARCH_DLINFO_IA32 \ do { \ - if (vdso32_enabled) { \ + if (VDSO_CURRENT_BASE) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d34bd370074b..6c5020163db0 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -304,13 +304,13 @@ static inline unsigned type in##bwl##_p(int port) \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ + : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ + : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \ } BUILDIO(b, b, char) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7befcb76..fc3c7e49c8e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -221,6 +221,9 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); + + unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -290,10 +293,10 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ + bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; struct x86_exception exception; diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index c2b8d24a235c..6226cb0eca23 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -35,6 +35,7 @@ struct kvm_page_track_notifier_node { }; void kvm_page_track_init(struct kvm *kvm); +void kvm_page_track_cleanup(struct kvm *kvm); void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9bd7ff5ffbcc..70c9cc3f098c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -257,6 +257,7 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif int mce_available(struct cpuinfo_x86 *c); +bool mce_is_memory_error(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8e0a9fe86de4..f9dd22469388 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -116,9 +116,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif - init_new_context_ldt(tsk, mm); - - return 0; + return init_new_context_ldt(tsk, mm); } static inline void destroy_context(struct mm_struct *mm) { diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 78f3760ca1f2..b601ddac5719 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -405,6 +405,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc + #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 0b1ff4c1c14e..fffb2794dd89 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -7,6 +7,7 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); +extern void init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 34684adb6899..b3b09b98896d 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -46,6 +46,15 @@ extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { + /* + * "Allocated" pkeys are those that have been returned + * from pkey_alloc(). pkey 0 is special, and never + * returned from pkey_alloc(). + */ + if (pkey <= 0) + return false; + if (pkey >= arch_max_pkey()) + return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } @@ -82,12 +91,6 @@ int mm_pkey_alloc(struct mm_struct *mm) static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { - /* - * pkey 0 is special, always allocated and can never - * be freed. - */ - if (!pkey) - return -EINVAL; if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 2c1ebeb4d737..e2904373010d 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -55,7 +55,8 @@ static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) - * instruction. + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. */ static inline void arch_wb_cache_pmem(void *addr, size_t size) { @@ -69,15 +70,6 @@ static inline void arch_wb_cache_pmem(void *addr, size_t size) clwb(p); } -/* - * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec - * iterators, so for other types (bvec & kvec) we must do a cache write-back. - */ -static inline bool __iter_needs_pmem_wb(struct iov_iter *i) -{ - return iter_is_iovec(i) == false; -} - /** * arch_copy_from_iter_pmem - copy data from an iterator to PMEM * @addr: PMEM destination address @@ -94,7 +86,35 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, /* TODO: skip the write-back by always using non-temporal stores */ len = copy_from_iter_nocache(addr, bytes, i); - if (__iter_needs_pmem_wb(i)) + /* + * In the iovec case on x86_64 copy_from_iter_nocache() uses + * non-temporal stores for the bulk of the transfer, but we need + * to manually flush if the transfer is unaligned. A cached + * memory copy is used when destination or size is not naturally + * aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. + * + * In the non-iovec case the entire destination needs to be + * flushed. + */ + if (iter_is_iovec(i)) { + unsigned long flushed, dest = (unsigned long) addr; + + if (bytes < 8) { + if (!IS_ALIGNED(dest, 4) || (bytes != 4)) + arch_wb_cache_pmem(addr, bytes); + } else { + if (!IS_ALIGNED(dest, 8)) { + dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); + arch_wb_cache_pmem(addr, 1); + } + + flushed = dest - (unsigned long) addr; + if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) + arch_wb_cache_pmem(addr + bytes - 1, 1); + } + } else arch_wb_cache_pmem(addr, bytes); return len; diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6fa85944af83..fc5abff9b7fd 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -188,7 +188,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (static_cpu_has(X86_FEATURE_PGE)) + if (boot_cpu_has(X86_FEATURE_PGE)) __flush_tlb_global(); else __flush_tlb(); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index faf3687f1035..dead0f3921f3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -68,6 +68,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +#else +# define WARN_ON_IN_IRQ() +#endif + /** * access_ok: - Checks if a user space pointer is valid * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that @@ -88,8 +94,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) \ - likely(!__range_not_ok(addr, size, user_addr_max())) +#define access_ok(type, addr, size) \ +({ \ + WARN_ON_IN_IRQ(); \ + likely(!__range_not_ok(addr, size, user_addr_max())); \ +}) /* * These are the main single-value transfer routines. They automatically @@ -315,10 +324,10 @@ do { \ #define __get_user_asm_u64(x, ptr, retval, errret) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %2,%%eax\n" \ "2: movl %3,%%edx\n" \ - "3: " ASM_CLAC "\n" \ + "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: mov %4,%0\n" \ " xorl %%eax,%%eax\n" \ @@ -327,7 +336,7 @@ do { \ ".previous\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - : "=r" (retval), "=A"(x) \ + : "=r" (retval), "=&A"(x) \ : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ }) diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a466..e6911caf5bbf 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a12a047184ee..8b678af866f7 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -43,6 +43,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/smap.h> #include <xen/interface/xen.h> #include <xen/interface/sched.h> @@ -214,10 +215,12 @@ privcmd_call(unsigned call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + stac(); asm volatile("call *%[call]" : __HYPERCALL_5PARAM : [call] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); + clac(); return (long)__res; } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 931ced8ca345..b89bef95f63b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -176,10 +176,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = __generic_processor_info(id, ver, enabled); + cpu = generic_processor_info(id, ver); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -338,6 +343,14 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, struct mpc_intsrc mp_irq; /* + * Check bus_irq boundary. + */ + if (bus_irq >= NR_IRQS_LEGACY) { + pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq); + return; + } + + /* * Convert 'gsi' to 'ioapic.pin'. */ ioapic = mp_find_ioapic(gsi); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2234918e494..c6583efdbdaf 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1863,14 +1863,14 @@ static void __smp_spurious_interrupt(u8 vector) "should never happen.\n", vector, smp_processor_id()); } -__visible void smp_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(~regs->orig_ax); exiting_irq(); } -__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) { u8 vector = ~regs->orig_ax; @@ -1921,14 +1921,14 @@ static void __smp_error_interrupt(struct pt_regs *regs) } -__visible void smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -__visible void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); @@ -2070,7 +2070,7 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -int __generic_processor_info(int apicid, int version, bool enabled) +int generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2128,11 +2128,9 @@ int __generic_processor_info(int apicid, int version, bool enabled) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - if (enabled) { - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); - } + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2184,23 +2182,13 @@ int __generic_processor_info(int apicid, int version, bool enabled) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - - if (enabled) { - num_processors++; - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - } else { - disabled_cpus++; - } + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; return cpu; } -int generic_processor_info(int apicid, int version) -{ - return __generic_processor_info(apicid, version, true); -} - int hard_smp_processor_id(void) { return read_apic_id(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d1e25564b3c1..cf89928dbd46 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1876,6 +1876,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ack_level, .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -1887,6 +1888,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ir_ack_level, .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2114,7 +2116,7 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic_irq(irq_get_chip_data(0)); + unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 5d30c5e42bb1..f3557a1eb562 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(data); } -asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 93d824ec3120..040af1939460 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -85,7 +85,7 @@ void mce_gen_pool_process(void) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->mce; - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index cd74a3f00aea..de20902ecf23 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,7 +13,7 @@ enum severity_level { MCE_PANIC_SEVERITY, }; -extern struct atomic_notifier_head x86_mce_decoder_chain; +extern struct blocking_notifier_head x86_mce_decoder_chain; #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 631356c8cca4..f46071cb2c90 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -245,6 +245,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc if (m->status & MCI_STATUS_UC) { + if (ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + /* * On older systems where overflow_recov flag is not present, we * should simply panic if an error overflow occurs. If @@ -255,10 +258,6 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc if (mce_flags.smca) return mce_severity_amd_smca(m, ctx); - /* software can try to contain */ - if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - /* kill current process */ return MCE_AR_SEVERITY; } else { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a7fdf453d895..8ca5f8ad008e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -120,7 +120,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) @@ -213,13 +213,13 @@ void mce_register_decode_chain(struct notifier_block *nb) if (nb != &mce_srao_nb && nb->priority == INT_MAX) nb->priority -= 1; - atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); void mce_unregister_decode_chain(struct notifier_block *nb) { - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -272,8 +272,6 @@ struct mca_msr_regs msr_ops = { static void print_mce(struct mce *m) { - int ret = 0; - pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -309,14 +307,6 @@ static void print_mce(struct mce *m) m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, cpu_data(m->extcpu).microcode); - /* - * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that) - */ - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - if (ret == NOTIFY_STOP) - return; - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } @@ -608,16 +598,14 @@ static void mce_read_aux(struct mce *m, int i) } } -static bool memory_error(struct mce *m) +bool mce_is_memory_error(struct mce *m) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { + if (m->cpuvendor == X86_VENDOR_AMD) { /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { + } else if (m->cpuvendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -638,6 +626,7 @@ static bool memory_error(struct mce *m) return false; } +EXPORT_SYMBOL_GPL(mce_is_memory_error); DEFINE_PER_CPU(unsigned, mce_poll_count); @@ -701,7 +690,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) + if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m)) if (m.status & MCI_STATUS_ADDRV) m.severity = severity; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9b5403462936..39526e1e3132 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -59,7 +59,7 @@ static const char * const th_names[] = { "load_store", "insn_fetch", "combined_unit", - "", + "decode_unit", "northbridge", "execution_unit", }; @@ -593,14 +593,14 @@ static inline void __smp_deferred_error_interrupt(void) deferred_error_int_vector(); } -asmlinkage __visible void smp_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); __smp_deferred_error_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) { entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); @@ -955,6 +955,9 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) const char *name = get_name(bank, NULL); int err = 0; + if (!dev) + return -ENODEV; + if (is_shared_bank(bank)) { nb = node_to_amd_nb(amd_get_nb_id(cpu)); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6b9dc4d18ccc..c460c91d0c8f 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -431,14 +431,16 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fcf9ae9384f4..976042371b4b 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage __visible void smp_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cdc0deab00c9..13dbcc0f9d03 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -34,6 +34,7 @@ #include <linux/mm.h> #include <asm/microcode_intel.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -1046,6 +1047,18 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } +static bool is_blacklisted(unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { + pr_err_once("late loading on model 79 is disabled.\n"); + return true; + } + + return false; +} + static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { @@ -1054,6 +1067,9 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, const struct firmware *firmware; enum ucode_state ret; + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); @@ -1078,6 +1094,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f44c5a50ab8..f228f74051b6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -31,6 +31,7 @@ #include <asm/apic.h> #include <asm/timer.h> #include <asm/reboot.h> +#include <asm/nmi.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -158,6 +159,26 @@ static unsigned char hv_get_nmi_reason(void) return 0; } +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes + * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * unknown NMI on the first CPU which gets it. + */ +static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) +{ + static atomic_t nmi_cpu = ATOMIC_INIT(-1); + + if (!unknown_nmi_panic) + return NMI_DONE; + + if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1) + return NMI_HANDLED; + + return NMI_DONE; +} +#endif + static void __init ms_hyperv_init_platform(void) { /* @@ -183,6 +204,9 @@ static void __init ms_hyperv_init_platform(void) pr_info("HyperV: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } + + register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST, + "hv_nmi_unknown"); #endif if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 2f2b8c7ccb85..6f0ab305dd5e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -101,6 +101,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c114b132d121..7052d9a65fe9 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -130,11 +130,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) { ret = copyin_to_xsaves(kbuf, ubuf, xsave); - else + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + /* xcomp_bv must be 0 when using uncompacted format */ + if (!ret && xsave->header.xcomp_bv) + ret = -EINVAL; + } + /* * In case of failure, mark all states as init: */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a184c210efba..3ec0d2d64601 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -329,6 +329,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + + /* xcomp_bv must be 0 when using uncompacted format */ + if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) + err = -EINVAL; } if (err || __copy_from_user(&env, buf, sizeof(env))) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8639bb2ae058..6bf09f5594b2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -983,6 +983,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long return_hooker = (unsigned long) &return_to_handler; + /* + * When resuming from suspend-to-ram, this function can be indirectly + * called from early CPU startup code while the CPU is in real mode, + * which would fail miserably. Make sure the stack pointer is a + * virtual address. + * + * This check isn't as accurate as virt_addr_valid(), but it should be + * good enough for this purpose, and it's fast. + */ + if (unlikely((long)__builtin_frame_address(0) >= 0)) + return; + if (unlikely(ftrace_graph_is_dead())) return; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 54a2372f5dbb..b5785c197e53 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -4,6 +4,7 @@ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE */ +#define DISABLE_BRANCH_PROFILING #include <linux/init.h> #include <linux/linkage.h> #include <linux/types.h> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f669fdd2010..8a7ad9fb22c1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -265,7 +265,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -__visible void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -316,7 +316,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) } #endif -__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 3512ba607361..275487872be2 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,6 +9,7 @@ #include <linux/hardirq.h> #include <asm/apic.h> #include <asm/trace/irq_vectors.h> +#include <linux/interrupt.h> static inline void __smp_irq_work_interrupt(void) { @@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -__visible void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index c6ee63f927ab..d688826e5736 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -67,7 +67,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(kprobe_opcode_t *instruction); +extern int can_boost(kprobe_opcode_t *instruction, void *addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d9d8d16b69db..b55d07b9d530 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -166,12 +166,12 @@ NOKPROBE_SYMBOL(skip_prefixes); * Returns non-zero if opcode is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(kprobe_opcode_t *opcodes) +int can_boost(kprobe_opcode_t *opcodes, void *addr) { kprobe_opcode_t opcode; kprobe_opcode_t *orig_opcodes = opcodes; - if (search_exception_tables((unsigned long)opcodes)) + if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ retry: @@ -416,7 +416,7 @@ static int arch_copy_kprobe(struct kprobe *p) * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. */ - if (can_boost(p->ainsn.insn)) + if (can_boost(p->ainsn.insn, p->addr)) p->ainsn.boostable = 0; else p->ainsn.boostable = -1; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3bb4c5f021f6..4d74f7386a61 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -178,7 +178,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src) while (len < RELATIVEJUMP_SIZE) { ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len)) + if (!ret || !can_boost(dest + len, src + len)) return -EINVAL; len += ret; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index edbbfc854e39..77f17cbfe271 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -141,7 +141,8 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1 || + rcu_preempt_depth(); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); @@ -152,6 +153,8 @@ void kvm_async_pf_task_wait(u32 token) if (hlist_unhashed(&n.link)) break; + rcu_irq_exit(); + if (!n.halted) { local_irq_enable(); schedule(); @@ -160,11 +163,11 @@ void kvm_async_pf_task_wait(u32 token) /* * We cannot reschedule. So halt. */ - rcu_irq_exit(); native_safe_halt(); - rcu_irq_enter(); local_irq_disable(); } + + rcu_irq_enter(); } if (!n.halted) finish_swait(&n.wq, &wait); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5d400ba1349d..d47517941bbc 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -296,7 +296,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, /* were we called with bad_dma_address? */ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { + if (unlikely(dma_addr < badend)) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3c1ca0..0887d2ae3797 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task) } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -216,10 +333,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -264,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; fpu_switch_t fpu_switch; fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); @@ -274,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -314,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, fpu_switch); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e244c19a2451..067f9813fd2c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -223,6 +223,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, + { /* Handle problems with rebooting on ASUS EeeBook X205TA */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TAW */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TAW", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"), + }, + }, /* Certec */ { /* Handle problems with rebooting on Certec BPC600 */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9c337b0e8ba7..feaab07fa124 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1054,6 +1054,13 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; /* + * This call is required when the CPU does not support PAT. If + * mtrr_bp_init() invoked it already via pat_init() the call has no + * effect. + */ + init_cache_modes(); + + /* * Define random base addresses for memory sections after max_pfn is * defined and before each memory section base is used. */ diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ec1f756f9dc9..71beb28600d4 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, if (from->si_signo == SIGSEGV) { if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)&to->si_lower; - compat_uptr_t upper = (unsigned long)&to->si_upper; + compat_uptr_t lower = (unsigned long)from->si_lower; + compat_uptr_t upper = (unsigned long)from->si_upper; put_user_ex(lower, &to->si_lower); put_user_ex(upper, &to->si_upper); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index c00cb64bc0a1..ca699677e288 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -259,7 +259,7 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -__visible void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { irq_enter(); ack_APIC_irq(); @@ -270,7 +270,7 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -294,14 +294,15 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -316,14 +317,16 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 36171bcd91f8..9fe7b9e1ae30 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -181,6 +181,12 @@ static void smp_callin(void) smp_store_cpu_info(cpuid); /* + * The topology information must be up to date before + * calibrate_delay() and notify_cpu_starting(). + */ + set_cpu_sibling_map(raw_smp_processor_id()); + + /* * Get our bogomips. * Update loops_per_jiffy in cpu_data. Previous call to * smp_store_cpu_info() stored a value that is close but not as @@ -190,11 +196,6 @@ static void smp_callin(void) cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; pr_debug("Stack at about %p\n", &cpuid); - /* - * This must be done before setting cpu_online_mask - * or calling notify_cpu_starting. - */ - set_cpu_sibling_map(raw_smp_processor_id()); wmb(); notify_cpu_starting(cpuid); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a55ed63b9f91..1119414ab419 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -140,7 +140,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (end - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -183,7 +183,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46b2f41f8b05..44bf5cf417d3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -694,6 +694,7 @@ unsigned long native_calibrate_tsc(void) crystal_khz = 24000; /* 24.0 MHz */ break; case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -1287,6 +1288,8 @@ static int __init init_tsc_clocksource(void) * exporting a reliable TSC. */ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } @@ -1379,12 +1382,10 @@ void __init tsc_init(void) unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); - struct cpumask *mask = topology_core_cpumask(cpu); + int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); + const struct cpumask *mask = topology_core_cpumask(cpu); - if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - - if (!mask) + if (tsc_disabled || !constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index afa7bbb596cd..91af75e37306 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -456,7 +456,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; cpuid_mask(&entry->ecx, CPUID_7_ECX); /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled) + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); } else { entry->ebx = 0; @@ -765,18 +765,20 @@ out: static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; + struct kvm_cpuid_entry2 *ej; + int j = i; + int nent = vcpu->arch.cpuid_nent; e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ + do { + j = (j + 1) % nent; + ej = &vcpu->arch.cpuid_entries[j]; + } while (ej->function != e->function); + + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + + return j; } /* find an entry with matching function, matching index (if needed), and that @@ -846,12 +848,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); - /* - * Perfmon not yet supported for L2 guest. - */ - if (is_guest_mode(vcpu) && function == 0xa) - best = NULL; - if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 35058c2c0eea..9368fecca3ee 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} + static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9f676adcdfc2..72b737b8c9d6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2543,7 +2543,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) u64 smbase; int ret; - if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); /* @@ -2592,11 +2592,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; } - if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; - ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } @@ -2738,6 +2738,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); } + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE; } @@ -5312,6 +5313,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; + unsigned emul_flags; ctxt->mem_read.pos = 0; @@ -5326,6 +5328,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5359,7 +5362,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5388,7 +5391,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5442,7 +5445,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3f05c044720b..b24b3c6d686e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -246,9 +246,14 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } +static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) +{ + return ((id >> 4) << 16) | (1 << (id & 0xf)); +} + static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + u32 ldr = kvm_apic_calc_x2apic_ldr(id); kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); @@ -2029,6 +2034,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, { if (apic_x2apic_mode(vcpu->arch.apic)) { u32 *id = (u32 *)(s->regs + APIC_ID); + u32 *ldr = (u32 *)(s->regs + APIC_LDR); if (vcpu->kvm->arch.x2apic_format) { if (*id != vcpu->vcpu_id) @@ -2039,6 +2045,10 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, else *id <<= 24; } + + /* In x2APIC mode, the LDR is fixed and based on the id */ + if (set) + *ldr = kvm_apic_calc_x2apic_ldr(*id); } return 0; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e986b4e4..d29c745f10ad 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3489,12 +3489,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool can_do_async_pf(struct kvm_vcpu *vcpu) +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; + if (is_guest_mode(vcpu)) + return false; + return kvm_x86_ops->interrupt_allowed(vcpu); } @@ -3510,7 +3513,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - if (!prefault && can_do_async_pf(vcpu)) { + if (!prefault && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(gva, gfn); @@ -3646,19 +3649,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - - /* * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. * If it is clear, there are no large pages at this level, so clear * PT_PAGE_SIZE_MASK in gpte if that is the case. */ gpte &= level - mmu->last_nonleaf_level; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + return gpte & PT_PAGE_SIZE_MASK; } @@ -4166,6 +4169,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); + update_last_nonleaf_level(vcpu, context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e91f2e4..c92834c55c59 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -75,6 +75,7 @@ enum { int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index b431539c3714..85024e0cfaa5 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -156,6 +156,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } +void kvm_page_track_cleanup(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + cleanup_srcu_struct(&head->track_srcu); +} + void kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a01105485315..37363900297d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -324,10 +324,11 @@ retry_walk: --walker->level; index = PT_INDEX(addr, walker->level); - table_gfn = gpte_to_gfn(pte); offset = index * sizeof(pt_element_t); pte_gpa = gfn_to_gpa(table_gfn) + offset; + + BUG_ON(walker->level < 1); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 9d4a8504a95a..5ab4a364348e 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8ca1eca5038d..23f1a6bd7a0d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2149,6 +2149,8 @@ static int ud_interception(struct vcpu_svm *svm) int er; er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); + if (er == EMULATE_USER_EXIT) + return 0; if (er != EMULATE_DONE) kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -3583,6 +3585,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; switch (ecx) { + case MSR_IA32_CR_PAT: + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; + vcpu->arch.pat = data; + svm->vmcb->save.g_pat = data; + mark_dirty(svm->vmcb, VMCB_NPT); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr); break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64774f419c72..f0d3de153e29 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2167,46 +2167,44 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) struct pi_desc old, new; unsigned int dest; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + return; + } + + /* The full case. */ do { old.control = new.control = pi_desc->control; - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. - */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } + dest = cpu_physical_id(cpu); - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } static void decache_tsc_multiplier(struct vcpu_vmx *vmx) @@ -2455,7 +2453,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; - nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); return 1; @@ -2987,7 +2985,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -3069,7 +3068,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK) || + (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -3693,7 +3696,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) } vmcs_write16(sf->selector, var.selector); - vmcs_write32(sf->base, var.base); + vmcs_writel(sf->base, var.base); vmcs_write32(sf->limit, var.limit); vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } @@ -4756,21 +4759,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. + * + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. + * + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); @@ -5490,6 +5502,8 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 1; } er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); + if (er == EMULATE_USER_EXIT) + return 0; if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -6474,7 +6488,6 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -6925,14 +6938,20 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, } page = nested_get_page(vcpu, vmptr); - if (page == NULL || - *(u32 *)kmap(page) != VMCS12_REVISION) { + if (page == NULL) { nested_vmx_failInvalid(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + if (*(u32 *)kmap(page) != VMCS12_REVISION) { kunmap(page); + nested_release_page_clean(page); + nested_vmx_failInvalid(vcpu); skip_emulated_instruction(vcpu); return 1; } kunmap(page); + nested_release_page_clean(page); vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: @@ -8045,8 +8064,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: - if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return false; return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); @@ -8131,6 +8148,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); case EXIT_REASON_PREEMPTION_TIMER: return false; + case EXIT_REASON_PML_FULL: + /* We don't expose PML support to L1. */ + return false; default: return true; } @@ -8202,7 +8222,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(sel), + name, vmcs_read16(sel), vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); @@ -9176,6 +9196,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + return &vmx->vcpu; free_vmcs: @@ -9985,6 +10012,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(vmx->nested.virtual_apic_page)); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } if (cpu_has_vmx_msr_bitmap() && @@ -10067,6 +10099,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } + if (enable_pml) { + /* + * Conceptually we want to copy the PML address and index from + * vmcs01 here, and then back to vmcs01 on nested vmexit. But, + * since we always flush the log on each vmexit, this happens + * to be equivalent to simply resetting the fields in vmcs02. + */ + ASSERT(vmx->pml_pg); + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + if (nested_cpu_has_ept(vmcs12)) { kvm_mmu_unload(vcpu); nested_ept_init_mmu_context(vcpu); @@ -10648,7 +10692,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - kvm_set_cr4(vcpu, vmcs12->host_cr4); + vmx_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); @@ -10673,6 +10717,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); + vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) @@ -10977,6 +11023,37 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + /* * This routine does the following things for vCPU which is going * to be blocked if VT-d PI is enabled. @@ -10992,7 +11069,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, */ static int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned long flags; unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -11002,34 +11078,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) !kvm_vcpu_apicv_active(vcpu)) return 0; - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } do { old.control = new.control = pi_desc->control; - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); @@ -11051,10 +11113,15 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); - return 0; + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -11070,44 +11137,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (vcpu->pre_pcpu == -1) return; - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); } static void vmx_post_block(struct kvm_vcpu *vcpu) @@ -11135,7 +11171,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || @@ -11144,7 +11180,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) @@ -11187,12 +11228,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (set) ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); + else ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 731044efb195..02d45296a97c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1735,6 +1735,7 @@ static u64 __get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + u64 ret; spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { @@ -1746,10 +1747,17 @@ static u64 __get_kvmclock_ns(struct kvm *kvm) hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; spin_unlock(&ka->pvclock_gtod_sync_lock); + /* both __this_cpu_read() and rdtsc() should be on the same cpu */ + get_cpu(); + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - return __pvclock_read_cycles(&hv_clock, rdtsc()); + ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + + put_cpu(); + + return ret; } u64 get_kvmclock_ns(struct kvm *kvm) @@ -1789,6 +1797,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) */ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + if (guest_hv_clock.version & 1) + ++guest_hv_clock.version; /* first time write, random junk */ + vcpu->hv_clock.version = guest_hv_clock.version + 1; kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, @@ -3051,6 +3062,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) return -EINVAL; + /* INITs are latched while in SMM */ + if (events->flags & KVM_VCPUEVENT_VALID_SMM && + (events->smi.smm || events->smi.pending) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + return -EINVAL; + process_nmi(vcpu); vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; @@ -3225,11 +3242,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, } } +#define XSAVE_MXCSR_OFFSET 24 + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* @@ -3237,11 +3257,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~kvm_supported_xcr0()) + if (xstate_bv & ~kvm_supported_xcr0() || + mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE) + if (xstate_bv & ~XFEATURE_MASK_FPSSE || + mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); @@ -4744,16 +4766,20 @@ emul_write: static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) { - /* TODO: String I/O for in kernel device */ - int r; + int r = 0, i; - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); + for (i = 0; i < vcpu->arch.pio.count; i++) { + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); + if (r) + break; + pd += vcpu->arch.pio.size; + } return r; } @@ -4791,6 +4817,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (vcpu->arch.pio.count) goto data_avail; + memset(vcpu->arch.pio_data, 0, size * count); + ret = emulator_pio_in_out(vcpu, size, port, val, count, true); if (ret) { data_avail: @@ -4974,6 +5002,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, if (var.unusable) { memset(desc, 0, sizeof(*desc)); + if (base3) + *base3 = 0; return false; } @@ -5129,6 +5159,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); } +static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +{ + return emul_to_vcpu(ctxt)->arch.hflags; +} + +static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +{ + kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5168,6 +5208,8 @@ static const struct x86_emulate_ops emulate_ops = { .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, + .get_hflags = emulator_get_hflags, + .set_hflags = emulator_set_hflags, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -5211,6 +5253,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; + ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : @@ -5220,7 +5264,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); - ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5427,37 +5470,26 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) { struct kvm_run *kvm_run = vcpu->run; - /* - * rflags is the old, "raw" value of the flags. The new value has - * not been saved yet. - * - * This is correct even for TF set by the guest, because "the - * processor will not generate this exception after the instruction - * that sets the TF flag". - */ - if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | - DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; - /* - * "Certain debug exceptions may clear bit 0-3. The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; + kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5547,6 +5579,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) return EMULATE_DONE; + if (ctxt->have_exception && inject_emulated_exception(vcpu)) + return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; return handle_emulation_failure(vcpu); @@ -5611,11 +5645,10 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - if (vcpu->arch.hflags != ctxt->emul_flags) - kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + if (r == EMULATE_DONE && + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) __kvm_set_rflags(vcpu, ctxt->eflags); @@ -6086,7 +6119,8 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); + return emulator_write_emulated(ctxt, rip, instruction, 3, + &ctxt->exception); } static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) @@ -7162,6 +7196,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; + /* INITs are latched while in SMM */ + if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || + mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) + return -EINVAL; + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); @@ -7976,6 +8016,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); + kvm_page_track_cleanup(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, @@ -8412,8 +8453,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) return true; else - return !kvm_event_needs_reinjection(vcpu) && - kvm_x86_ops->interrupt_allowed(vcpu); + return kvm_can_do_async_pf(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d376e4b48f88..04c067bafe24 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -84,7 +84,7 @@ ENTRY(copy_user_generic_unrolled) movl %edx,%ecx andl $63,%edx shrl $6,%ecx - jz 17f + jz .L_copy_short_string 1: movq (%rsi),%r8 2: movq 1*8(%rsi),%r9 3: movq 2*8(%rsi),%r10 @@ -105,7 +105,8 @@ ENTRY(copy_user_generic_unrolled) leaq 64(%rdi),%rdi decl %ecx jnz 1b -17: movl %edx,%ecx +.L_copy_short_string: + movl %edx,%ecx andl $7,%edx shrl $3,%ecx jz 20f @@ -221,6 +222,8 @@ EXPORT_SYMBOL(copy_user_generic_string) */ ENTRY(copy_user_enhanced_fast_string) ASM_STAC + cmpl $64,%edx + jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */ movl %edx,%ecx 1: rep movsb diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 779782f58324..9a53a06e5a3e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -290,7 +290,7 @@ EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled) _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 767be7c76034..1754e094bc28 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -896,7 +896,7 @@ EndTable GrpTable: Grp3_1 0: TEST Eb,Ib -1: +1: TEST Eb,Ib 2: NOT Eb 3: NEG Eb 4: MUL AL,Eb diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9f72ca3b2669..8b5ff88aa4f8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,8 +191,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) @@ -208,7 +207,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -218,13 +217,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guranteed to be 100% accurate because of * the race explained above. */ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -239,7 +237,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -718,8 +716,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -744,7 +740,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -853,8 +849,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -902,7 +897,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -915,9 +910,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -925,6 +920,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -932,7 +931,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? &pkey : NULL, si_code); } static noinline void @@ -975,7 +975,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1002,13 +1002,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1032,9 +1031,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1220,6 +1219,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1393,7 +1393,17 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. + * + * Note that handle_userfault() may also release and reacquire mmap_sem + * (and not return with VM_FAULT_RETRY), when returning to userland to + * repeat the page fault later with a VM_FAULT_NOPAGE retval + * (potentially after handling any pending signal during the return to + * userland). The return to userland is identified whenever + * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. + * Thus we have to be careful about not touching vma after handling the + * fault, so we read the pkey beforehand. */ + pkey = vma_pkey(vma); fault = handle_mm_fault(vma, address, flags); major |= fault & VM_FAULT_MAJOR; @@ -1422,7 +1432,7 @@ good_area: up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, &pkey, fault); return; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 0d4fb3ebbbac..1680768d392c 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -120,6 +120,11 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 0; } + if (!pte_allows_gup(pte_val(pte), write)) { + pte_unmap(ptep); + return 0; + } + if (pte_devmap(pte)) { pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { @@ -127,8 +132,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_unmap(ptep); return 0; } - } else if (!pte_allows_gup(pte_val(pte), write) || - pte_special(pte)) { + } else if (pte_special(pte)) { pte_unmap(ptep); return 0; } diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 2ae8584b44c7..fe342e8ed529 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -144,7 +144,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } if (mm->get_unmapped_area == arch_get_unmapped_area) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 22af912d66d2..889e7619a091 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -643,21 +643,40 @@ void __init init_mem_mapping(void) * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains BIOS code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. + * On x86, access has to be given to the first megabyte of RAM because that + * area traditionally contains BIOS code and data regions used by X, dosemu, + * and similar apps. Since they map the entire memory range, the whole range + * must be allowed (for mapping), but any areas that would otherwise be + * disallowed are flagged as being "zero filled" instead of rejected. + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. */ int devmem_is_allowed(unsigned long pagenr) { - if (pagenr < 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + if (page_is_ram(pagenr)) { + /* + * For disallowed memory regions in the low 1MB range, + * request that the page be shown as all zeros. + */ + if (pagenr < 256) + return 2; + + return 0; + } + + /* + * This must follow RAM test, since System RAM is considered a + * restricted resource under CONFIG_STRICT_IOMEM. + */ + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { + /* Low 1MB bypasses iomem restrictions. */ + if (pagenr < 256) + return 1; + return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; + } + + return 1; } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 14b9dd71d9e8..3e27ded6ac65 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -94,10 +94,10 @@ __setup("noexec32=", nonx32_setup); */ void sync_global_pgds(unsigned long start, unsigned long end, int removed) { - unsigned long address; + unsigned long addr; - for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); + for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + const pgd_t *pgd_ref = pgd_offset_k(addr); struct page *page; /* @@ -113,7 +113,7 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) pgd_t *pgd; spinlock_t *pgt_lock; - pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgd = (pgd_t *)page_address(page) + pgd_index(addr); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); @@ -689,7 +689,7 @@ static void __meminit free_pagetable(struct page *page, int order) if (PageReserved(page)) { __ClearPageReserved(page); - magic = (unsigned long)page->lru.next; + magic = (unsigned long)page->freelist; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0493c17b8a51..333362f992e4 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +#define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt #include <linux/bootmem.h> #include <linux/kasan.h> diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 887e57182716..aed206475aa7 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -48,7 +48,7 @@ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; #if defined(CONFIG_X86_ESPFIX64) static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; #elif defined(CONFIG_EFI) -static const unsigned long vaddr_end = EFI_VA_START; +static const unsigned long vaddr_end = EFI_VA_END; #else static const unsigned long vaddr_end = __START_KERNEL_map; #endif @@ -105,7 +105,7 @@ void __init kernel_randomize_memory(void) */ BUILD_BUG_ON(vaddr_start >= vaddr_end); BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && - vaddr_end >= EFI_VA_START); + vaddr_end >= EFI_VA_END); BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e4f800999b32..a75103e7f963 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -293,7 +293,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) * We were not able to extract an address from the instruction, * probably because there was something invalid in it. */ - if (info->si_addr == (void *)-1) { + if (info->si_addr == (void __user *)-1) { err = -EINVAL; goto err_out; } @@ -525,15 +525,7 @@ int mpx_handle_bd_fault(void) if (!kernel_managing_mpx_tables(current->mm)) return -EINVAL; - if (do_mpx_bt_fault()) { - force_sig(SIGSEGV, current); - /* - * The force_sig() is essentially "handling" this - * exception, so we do not pass up the error - * from do_mpx_bt_fault(). - */ - } - return 0; + return do_mpx_bt_fault(); } /* diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 6b7ce6279133..aca6295350f3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -100,5 +100,6 @@ void __init initmem_init(void) printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); + __vmalloc_start_set = true; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 83e701f160a9..89d7907c4218 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -36,14 +36,14 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt -static bool boot_cpu_done; - -static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); -static void init_cache_modes(void); +static bool __read_mostly boot_cpu_done; +static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static bool __read_mostly pat_initialized; +static bool __read_mostly init_cm_done; void pat_disable(const char *reason) { - if (!__pat_enabled) + if (pat_disabled) return; if (boot_cpu_done) { @@ -51,10 +51,8 @@ void pat_disable(const char *reason) return; } - __pat_enabled = 0; + pat_disabled = true; pr_info("x86/PAT: %s\n", reason); - - init_cache_modes(); } static int __init nopat(char *str) @@ -66,7 +64,7 @@ early_param("nopat", nopat); bool pat_enabled(void) { - return !!__pat_enabled; + return pat_initialized; } EXPORT_SYMBOL_GPL(pat_enabled); @@ -204,6 +202,8 @@ static void __init_cache_modes(u64 pat) update_cache_mode_entry(i, cache); } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); + + init_cm_done = true; } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) @@ -224,6 +224,7 @@ static void pat_bsp_init(u64 pat) } wrmsrl(MSR_IA32_CR_PAT, pat); + pat_initialized = true; __init_cache_modes(pat); } @@ -241,10 +242,9 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -static void init_cache_modes(void) +void init_cache_modes(void) { u64 pat = 0; - static int init_cm_done; if (init_cm_done) return; @@ -286,8 +286,6 @@ static void init_cache_modes(void) } __init_cache_modes(pat); - - init_cm_done = 1; } /** @@ -305,10 +303,8 @@ void pat_init(void) u64 pat; struct cpuinfo_x86 *c = &boot_cpu_data; - if (!pat_enabled()) { - init_cache_modes(); + if (pat_disabled) return; - } if ((c->x86_vendor == X86_VENDOR_INTEL) && (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a7655f6caf7d..75fb01109f94 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -263,8 +263,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, { struct flush_tlb_info info; - if (end == 0) - end = start + PAGE_SIZE; info.flush_mm = mm; info.flush_start = start; info.flush_end = end; @@ -393,7 +391,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); + flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); preempt_enable(); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 350f7096baac..7913b6921959 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -212,8 +212,8 @@ static void arch_perfmon_setup_counters(void) eax.full = cpuid_eax(0xa); /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ - if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 && - __this_cpu_read(cpu_info.x86_model) == 15) { + if (eax.split.version_id == 0 && boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 15) { eax.split.version_id = 2; eax.split.num_counters = 2; eax.split.bit_width = 40; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 6d52b94f4bb9..20fa7c84109d 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -571,3 +571,35 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); + +/* + * Apple MacBook Pro: Avoid [mem 0x7fa00000-0x7fbfffff] + * + * Using the [mem 0x7fa00000-0x7fbfffff] region, e.g., by assigning it to + * the 00:1c.0 Root Port, causes a conflict with [io 0x1804], which is used + * for soft poweroff and suspend-to-RAM. + * + * As far as we know, this is related to the address space, not to the Root + * Port itself. Attaching the quirk to the Root Port is a convenience, but + * it could probably also be a standalone DMI quirk. + * + * https://bugzilla.kernel.org/show_bug.cgi?id=103211 + */ +static void quirk_apple_mbp_poweroff(struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *res; + + if ((!dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,4") && + !dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,5")) || + pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x1c, 0)) + return; + + res = request_mem_region(0x7fa00000, 0x200000, + "MacBook Pro poweroff workaround"); + if (res) + dev_info(dev, "claimed %s %pR\n", res->name, res); + else + dev_info(dev, "can't work around MacBook Pro poweroff issue\n"); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index bedfab98077a..4ea9f290c19f 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -234,23 +234,14 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; for_each_pci_msi_entry(msidesc, dev) { - __pci_read_msi_msg(msidesc, &msg); - pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | - ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); - if (msg.data != XEN_PIRQ_MSI_DATA || - xen_irq_from_pirq(pirq) < 0) { - pirq = xen_allocate_pirq_msi(dev, msidesc); - if (pirq < 0) { - irq = -ENODEV; - goto error; - } - xen_msi_compose_msg(dev, pirq, &msg); - __pci_write_msi_msg(msidesc, &msg); - dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); - } else { - dev_dbg(&dev->dev, - "xen: msi already bound to pirq=%d\n", pirq); + pirq = xen_allocate_pirq_msi(dev, msidesc); + if (pirq < 0) { + irq = -ENODEV; + goto error; } + xen_msi_compose_msg(dev, pirq, &msg); + __pci_write_msi_msg(msidesc, &msg); + dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? @@ -456,7 +447,7 @@ void __init xen_msi_init(void) int __init pci_xen_hvm_init(void) { - if (!xen_feature(XENFEAT_hvm_pirqs)) + if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) return 0; #ifdef CONFIG_ACPI diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 30031d5293c4..393a0c0288d1 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -201,6 +201,10 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; } + /* No need to reserve regions that will never be freed. */ + if (md.attribute & EFI_MEMORY_RUNTIME) + return; + size += addr % EFI_PAGE_SIZE; size = round_up(size, EFI_PAGE_SIZE); addr = round_down(addr, EFI_PAGE_SIZE); @@ -354,6 +358,9 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } + if (!num_entries) + return; + new_size = efi.memmap.desc_size * num_entries; new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c index 1693107a518e..0d17c0aafeb1 100644 --- a/arch/x86/platform/goldfish/goldfish.c +++ b/arch/x86/platform/goldfish/goldfish.c @@ -42,10 +42,22 @@ static struct resource goldfish_pdev_bus_resources[] = { } }; +static bool goldfish_enable __initdata; + +static int __init goldfish_setup(char *str) +{ + goldfish_enable = true; + return 0; +} +__setup("goldfish", goldfish_setup); + static int __init goldfish_init(void) { + if (!goldfish_enable) + return -ENODEV; + platform_device_register_simple("goldfish_pdev_bus", -1, - goldfish_pdev_bus_resources, 2); + goldfish_pdev_bus_resources, 2); return 0; } device_initcall(goldfish_init); diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index dd6cfa4ad3ac..75029d0cfa15 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -15,7 +15,7 @@ obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o # SPI Devices -obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_spidev.o +obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o # I2C Devices obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c index 30c601b399ee..27186ad654c9 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c @@ -11,6 +11,7 @@ * of the License. */ +#include <linux/err.h> #include <linux/init.h> #include <linux/sfi.h> #include <linux/spi/pxa2xx_spi.h> @@ -34,6 +35,9 @@ static void __init *spidev_platform_data(void *info) { struct spi_board_info *spi_info = info; + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return ERR_PTR(-ENODEV); + spi_info->mode = SPI_MODE_0; spi_info->controller_data = &spidev_spi_chip; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 3f1f1c77d090..10bad1e55fcc 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -19,7 +19,7 @@ #include <asm/intel_scu_ipc.h> #include <asm/io_apic.h> -#define TANGIER_EXT_TIMER0_MSI 15 +#define TANGIER_EXT_TIMER0_MSI 12 static struct platform_device wdt_dev = { .name = "intel_mid_wdt", diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 555b9fa0ad43..7dbdb780264d 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -8,6 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro +KASAN_SANITIZE := n KCOV_INSTRUMENT := n # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 0c2fae8d929d..73eb7fd4aec4 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -992,11 +992,12 @@ static void emit_relocs(int as_text, int use_real_mode) die("Segment relocations found but --realmode not specified\n"); /* Order the relocations for more efficient processing */ - sort_relocs(&relocs16); sort_relocs(&relocs32); #if ELF_BITS == 64 sort_relocs(&relocs32neg); sort_relocs(&relocs64); +#else + sort_relocs(&relocs16); #endif /* Print the relocations */ diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index e30202b1716e..7c1601798169 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -125,7 +125,7 @@ int poke_user(struct task_struct *child, long addr, long data) else if ((addr >= offsetof(struct user, u_debugreg[0])) && (addr <= offsetof(struct user, u_debugreg[7]))) { addr -= offsetof(struct user, u_debugreg[0]); - addr = addr >> 2; + addr = addr >> 3; if ((addr == 4) || (addr == 5)) return -EIO; child->thread.arch.debugregs[addr] = data; diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 44c88ad1841a..bcea81f36fc5 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -145,7 +145,7 @@ static void xen_silent_inquire(int apicid) static int xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) - return xen_get_apic_id(xen_apic_read(APIC_ID)); + return cpu_data(cpu).apicid; else return BAD_APICID; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bdd855685403..8f1f7efa848c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -137,6 +137,8 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); static int xen_cpu_up_prepare(unsigned int cpu); static int xen_cpu_up_online(unsigned int cpu); @@ -1521,7 +1523,10 @@ static void __init xen_pvh_early_guest_init(void) if (!xen_feature(XENFEAT_auto_translated_physmap)) return; - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); + if (!xen_feature(XENFEAT_hvm_callback_vector)) + return; + + xen_have_vector_callback = 1; xen_pvh_early_cpu_init(0, false); xen_pvh_set_cr_flags(0); @@ -1860,7 +1865,9 @@ static int xen_cpu_up_prepare(unsigned int cpu) xen_vcpu_setup(cpu); } - if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) + if (xen_pv_domain() || + (xen_have_vector_callback && + xen_feature(XENFEAT_hvm_safe_pvclock))) xen_setup_timer(cpu); rc = xen_smp_intr_init(cpu); @@ -1876,7 +1883,9 @@ static int xen_cpu_dead(unsigned int cpu) { xen_smp_intr_free(cpu); - if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) + if (xen_pv_domain() || + (xen_have_vector_callback && + xen_feature(XENFEAT_hvm_safe_pvclock))) xen_teardown_timer(cpu); return 0; @@ -1915,8 +1924,8 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); - + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; xen_hvm_smp_init(); WARN_ON(xen_cpuhp_setup()); xen_unplug_emulated_devices(); @@ -1954,7 +1963,7 @@ bool xen_hvm_need_lapic(void) return false; if (!xen_hvm_domain()) return false; - if (xen_feature(XENFEAT_hvm_pirqs)) + if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) return false; return true; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7d5afdb417cc..418f1b8576cf 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2028,7 +2028,8 @@ static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) /* * Translate a virtual address to a physical one without relying on mapped - * page tables. + * page tables. Don't rely on big pages being aligned in (guest) physical + * space! */ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) { @@ -2049,7 +2050,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) sizeof(pud))); if (!pud_present(pud)) return 0; - pa = pud_pfn(pud) << PAGE_SHIFT; + pa = pud_val(pud) & PTE_PFN_MASK; if (pud_large(pud)) return pa + (vaddr & ~PUD_MASK); @@ -2057,7 +2058,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) sizeof(pmd))); if (!pmd_present(pmd)) return 0; - pa = pmd_pfn(pmd) << PAGE_SHIFT; + pa = pmd_val(pmd) & PTE_PFN_MASK; if (pmd_large(pmd)) return pa + (vaddr & ~PMD_MASK); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f8960fca0827..9f21b0c5945d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -713,10 +713,9 @@ static void __init xen_reserve_xen_mfnlist(void) size = PFN_PHYS(xen_start_info->nr_p2m_frames); } - if (!xen_is_e820_reserved(start, size)) { - memblock_reserve(start, size); + memblock_reserve(start, size); + if (!xen_is_e820_reserved(start, size)) return; - } #ifdef CONFIG_X86_32 /* @@ -727,6 +726,7 @@ static void __init xen_reserve_xen_mfnlist(void) BUG(); #else xen_relocate_p2m(); + memblock_free(start, size); #endif } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 311acad7dad2..137afbbd0590 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -765,6 +765,8 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) void __init xen_hvm_smp_init(void) { + if (!xen_have_vector_callback) + return; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_send_reschedule = xen_smp_send_reschedule; smp_ops.cpu_die = xen_cpu_die; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 33d8f6a7829d..67356d29d74d 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -432,6 +432,11 @@ static void xen_hvm_setup_cpu_clockevents(void) void __init xen_hvm_init_time_ops(void) { + /* vector callback is needed otherwise we cannot receive interrupts + * on cpu > 0 and at this point we don't know how many cpus are + * available */ + if (!xen_have_vector_callback) + return; if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { printk(KERN_INFO "Xen doesn't support pvclock on HVM," "disable pv timer\n"); |