diff options
Diffstat (limited to 'arch/x86')
47 files changed, 531 insertions, 326 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bde7948..0dc9d0144a27 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -857,7 +857,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !PCI_MSI + depends on X86_32 && !SMP && !X86_32_NON_STANDARD ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -868,6 +868,10 @@ config X86_UP_APIC performance counters), and the NMI watchdog which detects hard lockups. +config X86_UP_APIC_MSI + def_bool y + select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI + config X86_UP_IOAPIC bool "IO-APIC support on uniprocessors" depends on X86_UP_APIC diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 5b016e2498f3..3db07f30636f 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -51,6 +51,7 @@ targets += cpustr.h $(obj)/cpustr.h: $(obj)/mkcpustr FORCE $(call if_changed,cpustr) endif +clean-files += cpustr.h # --------------------------------------------------------------------------- diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index d999398928bc..ad754b4411f7 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -90,7 +90,7 @@ suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ - perl $(srctree)/arch/x86/tools/calc_run_size.pl) + $(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh) quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index dcc1c536cc21..a950864a64da 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -373,6 +373,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned long output_len, unsigned long run_size) { + unsigned char *output_orig = output; + real_mode = rmode; sanitize_boot_params(real_mode); @@ -421,7 +423,12 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, debug_putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); - handle_relocations(output, output_len); + /* + * 32-bit always performs relocations. 64-bit relocations are only + * needed if kASLR has chosen a different load address. + */ + if (!IS_ENABLED(CONFIG_X86_64) || output != output_orig) + handle_relocations(output, output_len); debug_putstr("done.\nBooting the kernel.\n"); return output; } diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index fd0f848938cc..5a4a089e8b1f 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -26,7 +26,6 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o -obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o @@ -46,6 +45,7 @@ endif ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o + obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/ endif aes-i586-y := aes-i586-asm_32.o aes_glue.o diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index 2df2a0298f5a..a916c4a61165 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -208,7 +208,7 @@ ddq_add_8: .if (klen == KEY_128) .if (load_keys) - vmovdqa 3*16(p_keys), xkeyA + vmovdqa 3*16(p_keys), xkey4 .endif .else vmovdqa 3*16(p_keys), xkeyA @@ -224,7 +224,7 @@ ddq_add_8: add $(16*by), p_in .if (klen == KEY_128) - vmovdqa 4*16(p_keys), xkey4 + vmovdqa 4*16(p_keys), xkeyB .else .if (load_keys) vmovdqa 4*16(p_keys), xkey4 @@ -234,7 +234,12 @@ ddq_add_8: .set i, 0 .rept by club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 3 */ + /* key 3 */ + .if (klen == KEY_128) + vaesenc xkey4, var_xdata, var_xdata + .else + vaesenc xkeyA, var_xdata, var_xdata + .endif .set i, (i +1) .endr @@ -243,13 +248,18 @@ ddq_add_8: .set i, 0 .rept by club XDATA, i - vaesenc xkey4, var_xdata, var_xdata /* key 4 */ + /* key 4 */ + .if (klen == KEY_128) + vaesenc xkeyB, var_xdata, var_xdata + .else + vaesenc xkey4, var_xdata, var_xdata + .endif .set i, (i +1) .endr .if (klen == KEY_128) .if (load_keys) - vmovdqa 6*16(p_keys), xkeyB + vmovdqa 6*16(p_keys), xkey8 .endif .else vmovdqa 6*16(p_keys), xkeyB @@ -267,12 +277,17 @@ ddq_add_8: .set i, 0 .rept by club XDATA, i - vaesenc xkeyB, var_xdata, var_xdata /* key 6 */ + /* key 6 */ + .if (klen == KEY_128) + vaesenc xkey8, var_xdata, var_xdata + .else + vaesenc xkeyB, var_xdata, var_xdata + .endif .set i, (i +1) .endr .if (klen == KEY_128) - vmovdqa 8*16(p_keys), xkey8 + vmovdqa 8*16(p_keys), xkeyB .else .if (load_keys) vmovdqa 8*16(p_keys), xkey8 @@ -288,7 +303,7 @@ ddq_add_8: .if (klen == KEY_128) .if (load_keys) - vmovdqa 9*16(p_keys), xkeyA + vmovdqa 9*16(p_keys), xkey12 .endif .else vmovdqa 9*16(p_keys), xkeyA @@ -297,7 +312,12 @@ ddq_add_8: .set i, 0 .rept by club XDATA, i - vaesenc xkey8, var_xdata, var_xdata /* key 8 */ + /* key 8 */ + .if (klen == KEY_128) + vaesenc xkeyB, var_xdata, var_xdata + .else + vaesenc xkey8, var_xdata, var_xdata + .endif .set i, (i +1) .endr @@ -306,7 +326,12 @@ ddq_add_8: .set i, 0 .rept by club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 9 */ + /* key 9 */ + .if (klen == KEY_128) + vaesenc xkey12, var_xdata, var_xdata + .else + vaesenc xkeyA, var_xdata, var_xdata + .endif .set i, (i +1) .endr @@ -412,7 +437,6 @@ ddq_add_8: /* main body of aes ctr load */ .macro do_aes_ctrmain key_len - cmp $16, num_bytes jb .Ldo_return2\key_len diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index a225a5ca1037..fd9f6b035b16 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -931,4 +931,4 @@ module_exit(sha1_mb_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated"); -MODULE_ALIAS("sha1"); +MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 0ab4f9fd2687..3a45668f6dc3 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -50,6 +50,7 @@ void acpi_pic_sci_set_trigger(unsigned int, u16); extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity); +extern void (*__acpi_unregister_gsi)(u32 gsi); static inline void disable_acpi(void) { diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 50d033a8947d..a94b82e8f156 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -251,7 +251,8 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; } -#define _LDT_empty(info) \ +/* This intentionally ignores lm, since 32-bit apps don't have that field. */ +#define LDT_empty(info) \ ((info)->base_addr == 0 && \ (info)->limit == 0 && \ (info)->contents == 0 && \ @@ -261,11 +262,18 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) (info)->seg_not_present == 1 && \ (info)->useable == 0) -#ifdef CONFIG_X86_64 -#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) -#else -#define LDT_empty(info) (_LDT_empty(info)) -#endif +/* Lots of programs expect an all-zero user_desc to mean "no segment at all". */ +static inline bool LDT_zero(const struct user_desc *info) +{ + return (info->base_addr == 0 && + info->limit == 0 && + info->contents == 0 && + info->read_exec_only == 0 && + info->seg_32bit == 0 && + info->limit_in_pages == 0 && + info->seg_not_present == 0 && + info->useable == 0); +} static inline void clear_LDT(void) { diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 40269a2bf6f9..4b75d591eb5e 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -130,7 +130,25 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end) { - mpx_notify_unmap(mm, vma, start, end); + /* + * mpx_notify_unmap() goes and reads a rarely-hot + * cacheline in the mm_struct. That can be expensive + * enough to be seen in profiles. + * + * The mpx_notify_unmap() call and its contents have been + * observed to affect munmap() performance on hardware + * where MPX is not present. + * + * The unlikely() optimizes for the fast case: no MPX + * in the CPU, or no MPX use in the process. Even if + * we get this wrong (in the unlikely event that MPX + * is widely enabled on some system) the overhead of + * MPX itself (reading bounds tables) is expected to + * overwhelm the overhead of getting this unlikely() + * consistently wrong. + */ + if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX))) + mpx_notify_unmap(mm, vma, start, end); } #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index e7e9682a33e9..f556c4843aa1 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -80,9 +80,11 @@ static inline unsigned int __getcpu(void) /* * Load per CPU data from GDT. LSL is faster than RDTSCP and - * works on all CPUs. + * works on all CPUs. This is volatile so that it orders + * correctly wrt barrier() and to keep gcc from cleverly + * hoisting it out of the calling function. */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); return p; } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4433a4be8171..b9e30daa0881 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -611,20 +611,20 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { - int irq; - - if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - *irqp = gsi; - } else { - mutex_lock(&acpi_ioapic_lock); - irq = mp_map_gsi_to_irq(gsi, - IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK); - mutex_unlock(&acpi_ioapic_lock); - if (irq < 0) - return -1; - *irqp = irq; + int rc, irq, trigger, polarity; + + rc = acpi_get_override_irq(gsi, &trigger, &polarity); + if (rc == 0) { + trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; + polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; + irq = acpi_register_gsi(NULL, gsi, trigger, polarity); + if (irq >= 0) { + *irqp = irq; + return 0; + } } - return 0; + + return -1; } EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); @@ -750,13 +750,13 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) } /* wrapper to silence section mismatch warning */ -int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) +int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu) { return _acpi_map_lsapic(handle, physid, pcpu); } -EXPORT_SYMBOL(acpi_map_lsapic); +EXPORT_SYMBOL(acpi_map_cpu); -int acpi_unmap_lsapic(int cpu) +int acpi_unmap_cpu(int cpu) { #ifdef CONFIG_ACPI_NUMA set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); @@ -768,8 +768,7 @@ int acpi_unmap_lsapic(int cpu) return (0); } - -EXPORT_SYMBOL(acpi_unmap_lsapic); +EXPORT_SYMBOL(acpi_unmap_cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e27b49d7c922..80091ae54c2b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -66,3 +66,4 @@ targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) endif +clean-files += capflags.c diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index e2b22df964cd..36d99a337b49 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -28,7 +28,7 @@ function dump_array() # If the /* comment */ starts with a quote string, grab that. VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" [ -z "$VALUE" ] && VALUE="\"$NAME\"" - [ "$VALUE" == '""' ] && continue + [ "$VALUE" = '""' ] && continue # Name is uppercase, VALUE is all lowercase VALUE="$(echo "$VALUE" | tr A-Z a-z)" diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index a450373e8e91..939155ffdece 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -107,6 +107,7 @@ static struct clocksource hyperv_cs = { .rating = 400, /* use this when running on Hyperv*/ .read = read_hv_clock, .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static void __init ms_hyperv_init_platform(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 944bf019b74f..498b6d967138 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2431,6 +2431,7 @@ __init int intel_pmu_init(void) break; case 55: /* 22nm Atom "Silvermont" */ + case 76: /* 14nm Atom "Airmont" */ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 3c895d480cd7..073983398364 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -568,8 +568,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { }; struct event_constraint intel_slm_pebs_event_constraints[] = { - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1), /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), EVENT_CONSTRAINT_END diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 673f930c700f..c4bb8b8e5017 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -103,6 +103,13 @@ static struct kobj_attribute format_attr_##_var = \ #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + struct rapl_pmu { spinlock_t lock; int hw_unit; /* 1/2^hw_unit Joule */ @@ -135,7 +142,7 @@ static inline u64 rapl_scale(u64 v) * or use ldexp(count, -32). * Watts = Joules/Time delta */ - return v << (32 - __this_cpu_read(rapl_pmu->hw_unit)); + return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit); } static u64 rapl_event_update(struct perf_event *event) @@ -379,23 +386,36 @@ static struct attribute_group rapl_pmu_attr_group = { .attrs = rapl_pmu_attrs, }; -EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); -EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); -EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); -EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); +static ssize_t rapl_sysfs_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); + + return 0; +} + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); -EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); -EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); -EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); -EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); /* * we compute in 0.23 nJ increments regardless of MSR */ -EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); -EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); -EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); -EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); static struct attribute *rapl_events_srv_attr[] = { EVENT_PTR(rapl_cores), diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 10b8d3eaaf15..c635b8b49e93 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -840,7 +840,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id box->phys_id = phys_id; box->pci_dev = pdev; box->pmu = pmu; - uncore_box_init(box); pci_set_drvdata(pdev, box); raw_spin_lock(&uncore_box_lock); @@ -1004,10 +1003,8 @@ static int uncore_cpu_starting(int cpu) pmu = &type->pmus[j]; box = *per_cpu_ptr(pmu->box, cpu); /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) { - uncore_box_init(box); + if (box && box->phys_id >= 0) continue; - } for_each_online_cpu(k) { exist = *per_cpu_ptr(pmu->box, k); @@ -1023,10 +1020,8 @@ static int uncore_cpu_starting(int cpu) } } - if (box) { + if (box) box->phys_id = phys_id; - uncore_box_init(box); - } } } return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 18eb78bbdd10..6c8c1e7e69d8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -17,7 +17,7 @@ #define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) #define UNCORE_PCI_DEV_IDX(data) (data & 0xff) #define UNCORE_EXTRA_PCI_DEV 0xff -#define UNCORE_EXTRA_PCI_DEV_MAX 2 +#define UNCORE_EXTRA_PCI_DEV_MAX 3 /* support up to 8 sockets */ #define UNCORE_SOCKET_MAX 8 @@ -257,6 +257,14 @@ static inline int uncore_num_counters(struct intel_uncore_box *box) return box->pmu->type->num_counters; } +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + static inline void uncore_disable_box(struct intel_uncore_box *box) { if (box->pmu->type->ops->disable_box) @@ -265,6 +273,8 @@ static inline void uncore_disable_box(struct intel_uncore_box *box) static inline void uncore_enable_box(struct intel_uncore_box *box) { + uncore_box_init(box); + if (box->pmu->type->ops->enable_box) box->pmu->type->ops->enable_box(box); } @@ -287,14 +297,6 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box, return box->pmu->type->ops->read_counter(box, event); } -static inline void uncore_box_init(struct intel_uncore_box *box) -{ - if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { - if (box->pmu->type->ops->init_box) - box->pmu->type->ops->init_box(box); - } -} - static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { return (box->phys_id < 0); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 745b158e9a65..21af6149edf2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -891,6 +891,7 @@ void snbep_uncore_cpu_init(void) enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, + HSWEP_PCI_PCU_3, }; static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -2026,6 +2027,17 @@ void hswep_uncore_cpu_init(void) { if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + + /* Detect 6-8 core systems with only two SBOXes */ + if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { + u32 capid4; + + pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], + 0x94, &capid4); + if (((capid4 >> 6) & 0x3) == 0) + hswep_uncore_sbox.num_boxes = 2; + } + uncore_msr_uncores = hswep_msr_uncores; } @@ -2287,6 +2299,11 @@ static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, SNBEP_PCI_QPI_PORT1_FILTER), }, + { /* PCU.3 (for Capability registers) */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + HSWEP_PCI_PCU_3), + }, { /* end: all zeroes */ } }; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2142376dc8c6..8b7b0a51e742 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -674,7 +674,7 @@ static inline void *alloc_tramp(unsigned long size) } static inline void tramp_free(void *tramp) { - module_free(NULL, tramp); + module_memfree(tramp); } #else /* Trampolines can only be created if modules are supported */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 6307a0f0cf17..705ef8d48e2d 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -127,7 +127,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) - seq_printf(p, "%*s: ", prec, "THR"); + seq_printf(p, "%*s: ", prec, "HYP"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); seq_puts(p, " Hypervisor callback interrupts\n"); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f7e3cd50ece0..98f654d466e5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1020,6 +1020,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_IF; trace_hardirqs_off(); regs->ip = (unsigned long)(jp->entry); + + /* + * jprobes use jprobe_return() which skips the normal return + * path of the function, and this messes up the accounting of the + * function graph tracer to get messed up. + * + * Pause function graph tracing while performing the jprobe function. + */ + pause_graph_tracing(); return 1; } NOKPROBE_SYMBOL(setjmp_pre_handler); @@ -1048,24 +1057,25 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); u8 *addr = (u8 *) (regs->ip - 1); struct jprobe *jp = container_of(p, struct jprobe, kp); + void *saved_sp = kcb->jprobe_saved_sp; if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if (stack_addr(regs) != kcb->jprobe_saved_sp) { + if (stack_addr(regs) != saved_sp) { struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; printk(KERN_ERR "current sp %p does not match saved sp %p\n", - stack_addr(regs), kcb->jprobe_saved_sp); + stack_addr(regs), saved_sp); printk(KERN_ERR "Saved registers for jprobe %p\n", jp); show_regs(saved_regs); printk(KERN_ERR "Current registers\n"); show_regs(regs); BUG(); } + /* It's OK to start function graph tracing again */ + unpause_graph_tracing(); *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), - kcb->jprobes_stack, - MIN_STACK_SIZE(kcb->jprobe_saved_sp)); + memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp)); preempt_enable_no_resched(); return 1; } diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index e309cc5c276e..781861cc5ee8 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -78,6 +78,14 @@ u64 perf_reg_abi(struct task_struct *task) { return PERF_SAMPLE_REGS_ABI_32; } + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} #else /* CONFIG_X86_64 */ #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ (1ULL << PERF_REG_X86_ES) | \ @@ -102,4 +110,86 @@ u64 perf_reg_abi(struct task_struct *task) else return PERF_SAMPLE_REGS_ABI_64; } + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + struct pt_regs *user_regs = task_pt_regs(current); + + /* + * If we're in an NMI that interrupted task_pt_regs setup, then + * we can't sample user regs at all. This check isn't really + * sufficient, though, as we could be in an NMI inside an interrupt + * that happened during task_pt_regs setup. + */ + if (regs->sp > (unsigned long)&user_regs->r11 && + regs->sp <= (unsigned long)(user_regs + 1)) { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; + regs_user->regs = NULL; + return; + } + + /* + * RIP, flags, and the argument registers are usually saved. + * orig_ax is probably okay, too. + */ + regs_user_copy->ip = user_regs->ip; + regs_user_copy->cx = user_regs->cx; + regs_user_copy->dx = user_regs->dx; + regs_user_copy->si = user_regs->si; + regs_user_copy->di = user_regs->di; + regs_user_copy->r8 = user_regs->r8; + regs_user_copy->r9 = user_regs->r9; + regs_user_copy->r10 = user_regs->r10; + regs_user_copy->r11 = user_regs->r11; + regs_user_copy->orig_ax = user_regs->orig_ax; + regs_user_copy->flags = user_regs->flags; + + /* + * Don't even try to report the "rest" regs. + */ + regs_user_copy->bx = -1; + regs_user_copy->bp = -1; + regs_user_copy->r12 = -1; + regs_user_copy->r13 = -1; + regs_user_copy->r14 = -1; + regs_user_copy->r15 = -1; + + /* + * For this to be at all useful, we need a reasonable guess for + * sp and the ABI. Be careful: we're in NMI context, and we're + * considering current to be the current task, so we should + * be careful not to look at any other percpu variables that might + * change during context switches. + */ + if (IS_ENABLED(CONFIG_IA32_EMULATION) && + task_thread_info(current)->status & TS_COMPAT) { + /* Easy case: we're in a compat syscall. */ + regs_user->abi = PERF_SAMPLE_REGS_ABI_32; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; + } else if (user_regs->orig_ax != -1) { + /* + * We're probably in a 64-bit syscall. + * Warning: this code is severely racy. At least it's better + * than just blindly copying user_regs. + */ + regs_user->abi = PERF_SAMPLE_REGS_ABI_64; + regs_user_copy->sp = this_cpu_read(old_rsp); + regs_user_copy->cs = __USER_CS; + regs_user_copy->ss = __USER_DS; + regs_user_copy->cx = -1; /* usually contains garbage */ + } else { + /* We're probably in an interrupt or exception. */ + regs_user->abi = user_64bit_mode(user_regs) ? + PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; + } + + regs_user->regs = regs_user_copy; +} #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 4e942f31b1a7..7fc5e843f247 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -29,7 +29,28 @@ static int get_free_idx(void) static bool tls_desc_okay(const struct user_desc *info) { - if (LDT_empty(info)) + /* + * For historical reasons (i.e. no one ever documented how any + * of the segmentation APIs work), user programs can and do + * assume that a struct user_desc that's all zeros except for + * entry_number means "no segment at all". This never actually + * worked. In fact, up to Linux 3.19, a struct user_desc like + * this would create a 16-bit read-write segment with base and + * limit both equal to zero. + * + * That was close enough to "no segment at all" until we + * hardened this function to disallow 16-bit TLS segments. Fix + * it up by interpreting these zeroed segments the way that they + * were almost certainly intended to be interpreted. + * + * The correct way to ask for "no segment at all" is to specify + * a user_desc that satisfies LDT_empty. To keep everything + * working, we accept both. + * + * Note that there's a similar kludge in modify_ldt -- look at + * the distinction between modes 1 and 0x11. + */ + if (LDT_empty(info) || LDT_zero(info)) return true; /* @@ -71,7 +92,7 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info)) + if (LDT_empty(info) || LDT_zero(info)) desc->a = desc->b = 0; else fill_ldt(desc, info); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b7e50bba3bbb..505449700e0c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -617,7 +617,7 @@ static unsigned long quick_pit_calibrate(void) goto success; } } - pr_err("Fast TSC calibration failed\n"); + pr_info("Fast TSC calibration failed\n"); return 0; success: diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 169b09d76ddd..de12c1d379f1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2348,7 +2348,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) * Not recognized on AMD in compat mode (but is recognized in legacy * mode). */ - if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA) + if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) && !vendor_intel(ctxt)) return emulate_ud(ctxt); @@ -2359,25 +2359,13 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(ctxt, &cs, &ss); ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); - switch (ctxt->mode) { - case X86EMUL_MODE_PROT32: - if ((msr_data & 0xfffc) == 0x0) - return emulate_gp(ctxt, 0); - break; - case X86EMUL_MODE_PROT64: - if (msr_data == 0x0) - return emulate_gp(ctxt, 0); - break; - default: - break; - } + if ((msr_data & 0xfffc) == 0x0) + return emulate_gp(ctxt, 0); ctxt->eflags &= ~(EFLG_VM | EFLG_IF); - cs_sel = (u16)msr_data; - cs_sel &= ~SELECTOR_RPL_MASK; + cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; - ss_sel &= ~SELECTOR_RPL_MASK; - if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) { + if (efer & EFER_LMA) { cs.d = 0; cs.l = 1; } @@ -2386,10 +2374,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); - ctxt->_eip = msr_data; + ctxt->_eip = (efer & EFER_LMA) ? msr_data : (u32)msr_data; ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); - *reg_write(ctxt, VCPU_REGS_RSP) = msr_data; + *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data : + (u32)msr_data; return X86EMUL_CONTINUE; } @@ -3791,8 +3780,8 @@ static const struct opcode group5[] = { }; static const struct opcode group6[] = { - DI(Prot, sldt), - DI(Prot, str), + DI(Prot | DstMem, sldt), + DI(Prot | DstMem, str), II(Prot | Priv | SrcMem16, em_lldt, lldt), II(Prot | Priv | SrcMem16, em_ltr, ltr), N, N, N, N, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4f0c0b954686..d52dcf0776ea 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -192,6 +192,9 @@ static void recalculate_apic_map(struct kvm *kvm) u16 cid, lid; u32 ldr, aid; + if (!kvm_apic_present(vcpu)) + continue; + aid = kvm_apic_id(apic); ldr = kvm_apic_get_reg(apic, APIC_LDR); cid = apic_cluster_id(new, ldr); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 10fbed126b11..f83fc6c5e0ba 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4448,7 +4448,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) * zap all shadow pages. */ if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { - printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); + printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index feb852b04598..d4c58d884838 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5840,53 +5840,10 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); - vmx_disable_intercept_for_msr(MSR_GS_BASE, false); - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - - memcpy(vmx_msr_bitmap_legacy_x2apic, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, - vmx_msr_bitmap_longmode, PAGE_SIZE); - - if (enable_apicv) { - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); - - /* According SDM, in x2apic mode, the whole id reg is used. - * But in KVM, it only use the highest eight bits. Need to - * intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); - /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); - /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); - } - - if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); - - update_ple_window_actual_max(); - if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; goto out7; - } + } if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -5945,6 +5902,49 @@ static __init int hardware_setup(void) if (nested) nested_vmx_setup_ctls_msrs(); + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); + } + + if (enable_ept) { + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); + + update_ple_window_actual_max(); + return alloc_kvm_area(); out7: diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 2480978b31cc..1313ae6b478b 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -28,7 +28,7 @@ /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ - ((insn)->next_byte + sizeof(t) + n < (insn)->end_kaddr) + ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 38dcec403b46..e3ff27a5b634 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -898,6 +898,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address); else BUG(); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a97ee0801475..079c3b6a3ff1 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -43,7 +43,7 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, }; -EXPORT_SYMBOL_GPL(__cachemode2pte_tbl); +EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, @@ -54,7 +54,7 @@ uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -EXPORT_SYMBOL_GPL(__pte2cachemode_tbl); +EXPORT_SYMBOL(__pte2cachemode_tbl); static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; @@ -438,20 +438,20 @@ static unsigned long __init init_range_memory_mapping( static unsigned long __init get_new_step_size(unsigned long step_size) { /* - * Explain why we shift by 5 and why we don't have to worry about - * 'step_size << 5' overflowing: - * - * initial mapped size is PMD_SIZE (2M). + * Initial mapped size is PMD_SIZE (2M). * We can not set step_size to be PUD_SIZE (1G) yet. * In worse case, when we cross the 1G boundary, and * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) - * to map 1G range with PTE. Use 5 as shift for now. + * to map 1G range with PTE. Hence we use one less than the + * difference of page table level shifts. * - * Don't need to worry about overflow, on 32bit, when step_size - * is 0, round_down() returns 0 for start, and that turns it - * into 0x100000000ULL. + * Don't need to worry about overflow in the top-down case, on 32bit, + * when step_size is 0, round_down() returns 0 for start, and that + * turns it into 0x100000000ULL. + * In the bottom-up case, round_up(x, 0) returns 0 though too, which + * needs to be taken into consideration by the code below. */ - return step_size << 5; + return step_size << (PMD_SHIFT - PAGE_SHIFT - 1); } /** @@ -471,7 +471,6 @@ static void __init memory_map_top_down(unsigned long map_start, unsigned long step_size; unsigned long addr; unsigned long mapped_ram_size = 0; - unsigned long new_mapped_ram_size; /* xen has big range in reserved near end of ram, skip it at first.*/ addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); @@ -496,14 +495,12 @@ static void __init memory_map_top_down(unsigned long map_start, start = map_start; } else start = map_start; - new_mapped_ram_size = init_range_memory_mapping(start, + mapped_ram_size += init_range_memory_mapping(start, last_start); last_start = start; min_pfn_mapped = last_start >> PAGE_SHIFT; - /* only increase step_size after big range get mapped */ - if (new_mapped_ram_size > mapped_ram_size) + if (mapped_ram_size >= step_size) step_size = get_new_step_size(step_size); - mapped_ram_size += new_mapped_ram_size; } if (real_end < map_end) @@ -524,7 +521,7 @@ static void __init memory_map_top_down(unsigned long map_start, static void __init memory_map_bottom_up(unsigned long map_start, unsigned long map_end) { - unsigned long next, new_mapped_ram_size, start; + unsigned long next, start; unsigned long mapped_ram_size = 0; /* step_size need to be small so pgt_buf from BRK could cover it */ unsigned long step_size = PMD_SIZE; @@ -539,19 +536,19 @@ static void __init memory_map_bottom_up(unsigned long map_start, * for page table. */ while (start < map_end) { - if (map_end - start > step_size) { + if (step_size && map_end - start > step_size) { next = round_up(start + 1, step_size); if (next > map_end) next = map_end; - } else + } else { next = map_end; + } - new_mapped_ram_size = init_range_memory_mapping(start, next); + mapped_ram_size += init_range_memory_mapping(start, next); start = next; - if (new_mapped_ram_size > mapped_ram_size) + if (mapped_ram_size >= step_size) step_size = get_new_step_size(step_size); - mapped_ram_size += new_mapped_ram_size; } } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 67ebf5751222..c439ec478216 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -349,6 +349,12 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk) return MPX_INVALID_BOUNDS_DIR; /* + * 32-bit binaries on 64-bit kernels are currently + * unsupported. + */ + if (IS_ENABLED(CONFIG_X86_64) && test_thread_flag(TIF_IA32)) + return MPX_INVALID_BOUNDS_DIR; + /* * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index edf299c8ff6c..7ac68698406c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -234,8 +234,13 @@ void pat_init(void) PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); /* Boot CPU check */ - if (!boot_pat_state) + if (!boot_pat_state) { rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + if (!boot_pat_state) { + pat_disable("PAT read returns always zero, disabled."); + return; + } + } wrmsrl(MSR_IA32_CR_PAT, pat); diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 9b18ef315a55..349c0d32cc0b 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -216,7 +216,7 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev) continue; if (r->parent) /* Already allocated */ continue; - if (!r->start || pci_claim_resource(dev, idx) < 0) { + if (!r->start || pci_claim_bridge_resource(dev, idx) < 0) { /* * Something is wrong with the region. * Invalidate the resource to prevent diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index c489ef2c1a39..9098d880c476 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -458,6 +458,7 @@ int __init pci_xen_hvm_init(void) * just how GSIs get registered. */ __acpi_register_gsi = acpi_register_gsi_xen_hvm; + __acpi_unregister_gsi = NULL; #endif #ifdef CONFIG_PCI_MSI @@ -471,52 +472,6 @@ int __init pci_xen_hvm_init(void) } #ifdef CONFIG_XEN_DOM0 -static __init void xen_setup_acpi_sci(void) -{ - int rc; - int trigger, polarity; - int gsi = acpi_sci_override_gsi; - int irq = -1; - int gsi_override = -1; - - if (!gsi) - return; - - rc = acpi_get_override_irq(gsi, &trigger, &polarity); - if (rc) { - printk(KERN_WARNING "xen: acpi_get_override_irq failed for acpi" - " sci, rc=%d\n", rc); - return; - } - trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; - polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; - - printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d " - "polarity=%d\n", gsi, trigger, polarity); - - /* Before we bind the GSI to a Linux IRQ, check whether - * we need to override it with bus_irq (IRQ) value. Usually for - * IRQs below IRQ_LEGACY_IRQ this holds IRQ == GSI, as so: - * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level) - * but there are oddballs where the IRQ != GSI: - * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level) - * which ends up being: gsi_to_irq[9] == 20 - * (which is what acpi_gsi_to_irq ends up calling when starting the - * the ACPI interpreter and keels over since IRQ 9 has not been - * setup as we had setup IRQ 20 for it). - */ - if (acpi_gsi_to_irq(gsi, &irq) == 0) { - /* Use the provided value if it's valid. */ - if (irq >= 0) - gsi_override = irq; - } - - gsi = xen_register_gsi(gsi, gsi_override, trigger, polarity); - printk(KERN_INFO "xen: acpi sci %d\n", gsi); - - return; -} - int __init pci_xen_initial_domain(void) { int irq; @@ -527,8 +482,8 @@ int __init pci_xen_initial_domain(void) x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; pci_msi_ignore_mask = 1; #endif - xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; + __acpi_unregister_gsi = NULL; /* Pre-allocate legacy irqs */ for (irq = 0; irq < nr_legacy_irqs(); irq++) { int trigger, polarity; diff --git a/arch/x86/tools/calc_run_size.pl b/arch/x86/tools/calc_run_size.pl deleted file mode 100644 index 23210baade2d..000000000000 --- a/arch/x86/tools/calc_run_size.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/perl -# -# Calculate the amount of space needed to run the kernel, including room for -# the .bss and .brk sections. -# -# Usage: -# objdump -h a.out | perl calc_run_size.pl -use strict; - -my $mem_size = 0; -my $file_offset = 0; - -my $sections=" *[0-9]+ \.(?:bss|brk) +"; -while (<>) { - if (/^$sections([0-9a-f]+) +(?:[0-9a-f]+ +){2}([0-9a-f]+)/) { - my $size = hex($1); - my $offset = hex($2); - $mem_size += $size; - if ($file_offset == 0) { - $file_offset = $offset; - } elsif ($file_offset != $offset) { - # BFD linker shows the same file offset in ELF. - # Gold linker shows them as consecutive. - next if ($file_offset + $mem_size == $offset + $size); - - printf STDERR "file_offset: 0x%lx\n", $file_offset; - printf STDERR "mem_size: 0x%lx\n", $mem_size; - printf STDERR "offset: 0x%lx\n", $offset; - printf STDERR "size: 0x%lx\n", $size; - - die ".bss and .brk are non-contiguous\n"; - } - } -} - -if ($file_offset == 0) { - die "Never found .bss or .brk file offset\n"; -} -printf("%d\n", $mem_size + $file_offset); diff --git a/arch/x86/tools/calc_run_size.sh b/arch/x86/tools/calc_run_size.sh new file mode 100644 index 000000000000..1a4c17bb3910 --- /dev/null +++ b/arch/x86/tools/calc_run_size.sh @@ -0,0 +1,42 @@ +#!/bin/sh +# +# Calculate the amount of space needed to run the kernel, including room for +# the .bss and .brk sections. +# +# Usage: +# objdump -h a.out | sh calc_run_size.sh + +NUM='\([0-9a-fA-F]*[ \t]*\)' +OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p') +if [ -z "$OUT" ] ; then + echo "Never found .bss or .brk file offset" >&2 + exit 1 +fi + +OUT=$(echo ${OUT# }) +sizeA=$(printf "%d" 0x${OUT%% *}) +OUT=${OUT#* } +offsetA=$(printf "%d" 0x${OUT%% *}) +OUT=${OUT#* } +sizeB=$(printf "%d" 0x${OUT%% *}) +OUT=${OUT#* } +offsetB=$(printf "%d" 0x${OUT%% *}) + +run_size=$(( $offsetA + $sizeA + $sizeB )) + +# BFD linker shows the same file offset in ELF. +if [ "$offsetA" -ne "$offsetB" ] ; then + # Gold linker shows them as consecutive. + endB=$(( $offsetB + $sizeB )) + if [ "$endB" != "$run_size" ] ; then + printf "sizeA: 0x%x\n" $sizeA >&2 + printf "offsetA: 0x%x\n" $offsetA >&2 + printf "sizeB: 0x%x\n" $sizeB >&2 + printf "offsetB: 0x%x\n" $offsetB >&2 + echo ".bss and .brk are non-contiguous" >&2 + exit 1 + fi +fi + +printf "%d\n" $run_size +exit 0 diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 531d4269e2e3..bd16d6c370ec 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -34,7 +34,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 20c3649d0691..5cdfa9db2217 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -47,7 +47,7 @@ typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 009495b9ab4b..1c9f750c3859 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -41,12 +41,17 @@ void __init init_vdso_image(const struct vdso_image *image) struct linux_binprm; -/* Put the vdso above the (randomized) stack with another randomized offset. - This way there is no hole in the middle of address space. - To save memory make sure it is still in the same PTE as the stack top. - This doesn't give that many random bits. - - Only used for the 64-bit and x32 vdsos. */ +/* + * Put the vdso above the (randomized) stack with another randomized + * offset. This way there is no hole in the middle of address space. + * To save memory make sure it is still in the same PTE as the stack + * top. This doesn't give that many random bits. + * + * Note that this algorithm is imperfect: the distribution of the vdso + * start address within a PMD is biased toward the end. + * + * Only used for the 64-bit and x32 vdsos. + */ static unsigned long vdso_addr(unsigned long start, unsigned len) { #ifdef CONFIG_X86_32 @@ -54,22 +59,30 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) #else unsigned long addr, end; unsigned offset; - end = (start + PMD_SIZE - 1) & PMD_MASK; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; if (end >= TASK_SIZE_MAX) end = TASK_SIZE_MAX; end -= len; - /* This loses some more bits than a modulo, but is cheaper */ - offset = get_random_int() & (PTRS_PER_PTE - 1); - addr = start + (offset << PAGE_SHIFT); - if (addr >= end) - addr = end; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } /* - * page-align it here so that get_unmapped_area doesn't - * align it wrongfully again to the next page. addr can come in 4K - * unaligned here as a result of stack start randomization. + * Forcibly align the final address in case we have a hardware + * issue that requires alignment for performance reasons. */ - addr = PAGE_ALIGN(addr); addr = align_vdso_addr(addr); return addr; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 6bf3a13e3e0f..78a881b7fc41 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -40,6 +40,7 @@ #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> #include <xen/interface/memory.h> +#include <xen/interface/nmi.h> #include <xen/interface/xen-mca.h> #include <xen/features.h> #include <xen/page.h> @@ -66,6 +67,7 @@ #include <asm/reboot.h> #include <asm/stackprotector.h> #include <asm/hypervisor.h> +#include <asm/mach_traps.h> #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/pat.h> @@ -1351,6 +1353,21 @@ static const struct machine_ops xen_machine_ops __initconst = { .emergency_restart = xen_emergency_restart, }; +static unsigned char xen_get_nmi_reason(void) +{ + unsigned char reason = 0; + + /* Construct a value which looks like it came from port 0x61. */ + if (test_bit(_XEN_NMIREASON_io_error, + &HYPERVISOR_shared_info->arch.nmi_reason)) + reason |= NMI_REASON_IOCHK; + if (test_bit(_XEN_NMIREASON_pci_serr, + &HYPERVISOR_shared_info->arch.nmi_reason)) + reason |= NMI_REASON_SERR; + + return reason; +} + static void __init xen_boot_params_init_edd(void) { #if IS_ENABLED(CONFIG_EDD) @@ -1535,9 +1552,12 @@ asmlinkage __visible void __init xen_start_kernel(void) pv_info = xen_info; pv_init_ops = xen_init_ops; pv_apic_ops = xen_apic_ops; - if (!xen_pvh_domain()) + if (!xen_pvh_domain()) { pv_cpu_ops = xen_cpu_ops; + x86_platform.get_nmi_reason = xen_get_nmi_reason; + } + if (xen_feature(XENFEAT_auto_translated_physmap)) x86_init.resources.memory_setup = xen_auto_xlated_memory_setup; else diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index edbc7a63fd73..70fb5075c901 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -167,10 +167,13 @@ static void * __ref alloc_p2m_page(void) return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); } -/* Only to be called in case of a race for a page just allocated! */ -static void free_p2m_page(void *p) +static void __ref free_p2m_page(void *p) { - BUG_ON(!slab_is_available()); + if (unlikely(!slab_is_available())) { + free_bootmem((unsigned long)p, PAGE_SIZE); + return; + } + free_page((unsigned long)p); } @@ -375,7 +378,7 @@ static void __init xen_rebuild_p2m_list(unsigned long *p2m) p2m_missing_pte : p2m_identity_pte; for (i = 0; i < PMDS_PER_MID_PAGE; i++) { pmdp = populate_extra_pmd( - (unsigned long)(p2m + pfn + i * PTRS_PER_PTE)); + (unsigned long)(p2m + pfn) + i * PMD_SIZE); set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE)); } } @@ -436,10 +439,9 @@ EXPORT_SYMBOL_GPL(get_phys_to_machine); * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by a individual * pmd. In case of PAE/x86-32 there are multiple pmds to allocate! */ -static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg) +static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) { pte_t *ptechk; - pte_t *pteret = ptep; pte_t *pte_newpg[PMDS_PER_MID_PAGE]; pmd_t *pmdp; unsigned int level; @@ -473,8 +475,6 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg) if (ptechk == pte_pg) { set_pmd(pmdp, __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); - if (vaddr == (addr & ~(PMD_SIZE - 1))) - pteret = pte_offset_kernel(pmdp, addr); pte_newpg[i] = NULL; } @@ -488,7 +488,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg) vaddr += PMD_SIZE; } - return pteret; + return lookup_address(addr, &level); } /* @@ -517,7 +517,7 @@ static bool alloc_p2m(unsigned long pfn) if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) { /* PMD level is missing, allocate a new one */ - ptep = alloc_p2m_pmd(addr, ptep, pte_pg); + ptep = alloc_p2m_pmd(addr, pte_pg); if (!ptep) return false; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dfd77dec8e2b..865e56cea7a0 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -140,7 +140,7 @@ static void __init xen_del_extra_mem(u64 start, u64 size) unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { int i; - unsigned long addr = PFN_PHYS(pfn); + phys_addr_t addr = PFN_PHYS(pfn); for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { if (addr >= xen_extra_mem[i].start && @@ -160,6 +160,8 @@ void __init xen_inv_extra_mem(void) int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + if (!xen_extra_mem[i].size) + continue; pfn_s = PFN_DOWN(xen_extra_mem[i].start); pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); for (pfn = pfn_s; pfn < pfn_e; pfn++) @@ -229,15 +231,14 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, - unsigned long *released) + unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) { - unsigned long len = 0; unsigned long pfn, end; int ret; WARN_ON(start_pfn > end_pfn); + /* Release pages first. */ end = min(end_pfn, nr_pages); for (pfn = start_pfn; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -250,16 +251,14 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { + (*released)++; if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) break; - len++; } else break; } - /* Need to release pages first */ - *released += len; - *identity += set_phys_range_identity(start_pfn, end_pfn); + set_phys_range_identity(start_pfn, end_pfn); } /* @@ -287,7 +286,7 @@ static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn) } /* Update kernel mapping, but not for highmem. */ - if ((pfn << PAGE_SHIFT) >= __pa(high_memory)) + if (pfn >= PFN_UP(__pa(high_memory - 1))) return; if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT), @@ -318,7 +317,6 @@ static void __init xen_do_set_identity_and_remap_chunk( unsigned long ident_pfn_iter, remap_pfn_iter; unsigned long ident_end_pfn = start_pfn + size; unsigned long left = size; - unsigned long ident_cnt = 0; unsigned int i, chunk; WARN_ON(size == 0); @@ -347,8 +345,7 @@ static void __init xen_do_set_identity_and_remap_chunk( xen_remap_mfn = mfn; /* Set identity map */ - ident_cnt += set_phys_range_identity(ident_pfn_iter, - ident_pfn_iter + chunk); + set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk); left -= chunk; } @@ -371,7 +368,7 @@ static void __init xen_do_set_identity_and_remap_chunk( static unsigned long __init xen_set_identity_and_remap_chunk( const struct e820entry *list, size_t map_size, unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *identity, unsigned long *released) + unsigned long *released, unsigned long *remapped) { unsigned long pfn; unsigned long i = 0; @@ -386,8 +383,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk( /* Do not remap pages beyond the current allocation */ if (cur_pfn >= nr_pages) { /* Identity map remaining pages */ - *identity += set_phys_range_identity(cur_pfn, - cur_pfn + size); + set_phys_range_identity(cur_pfn, cur_pfn + size); break; } if (cur_pfn + size > nr_pages) @@ -398,7 +394,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (!remap_range_size) { pr_warning("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, identity, released); + cur_pfn + left, nr_pages, released); break; } /* Adjust size to fit in current e820 RAM region */ @@ -410,7 +406,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk( /* Update variables to reflect new mappings. */ i += size; remap_pfn += size; - *identity += size; + *remapped += size; } /* @@ -427,13 +423,13 @@ static unsigned long __init xen_set_identity_and_remap_chunk( static void __init xen_set_identity_and_remap( const struct e820entry *list, size_t map_size, unsigned long nr_pages, - unsigned long *released) + unsigned long *released, unsigned long *remapped) { phys_addr_t start = 0; - unsigned long identity = 0; unsigned long last_pfn = nr_pages; const struct e820entry *entry; unsigned long num_released = 0; + unsigned long num_remapped = 0; int i; /* @@ -460,14 +456,14 @@ static void __init xen_set_identity_and_remap( last_pfn = xen_set_identity_and_remap_chunk( list, map_size, start_pfn, end_pfn, nr_pages, last_pfn, - &identity, &num_released); + &num_released, &num_remapped); start = end; } } *released = num_released; + *remapped = num_remapped; - pr_info("Set %ld page(s) to 1-1 mapping\n", identity); pr_info("Released %ld page(s)\n", num_released); } @@ -586,6 +582,7 @@ char * __init xen_memory_setup(void) struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; + unsigned long remapped_pages; int i; int op; @@ -635,9 +632,10 @@ char * __init xen_memory_setup(void) * underlying RAM. */ xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages); + &xen_released_pages, &remapped_pages); extra_pages += xen_released_pages; + extra_pages += remapped_pages; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index f473d268d387..69087341d9ae 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -391,7 +391,7 @@ static const struct clock_event_device *xen_clockevent = struct xen_clock_event_device { struct clock_event_device evt; - char *name; + char name[16]; }; static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 }; @@ -420,46 +420,38 @@ void xen_teardown_timer(int cpu) if (evt->irq >= 0) { unbind_from_irqhandler(evt->irq, NULL); evt->irq = -1; - kfree(per_cpu(xen_clock_events, cpu).name); - per_cpu(xen_clock_events, cpu).name = NULL; } } void xen_setup_timer(int cpu) { - char *name; - struct clock_event_device *evt; + struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu); + struct clock_event_device *evt = &xevt->evt; int irq; - evt = &per_cpu(xen_clock_events, cpu).evt; WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu); if (evt->irq >= 0) xen_teardown_timer(cpu); printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); - name = kasprintf(GFP_KERNEL, "timer%d", cpu); - if (!name) - name = "<timer kasprintf failed>"; + snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu); irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| IRQF_FORCE_RESUME|IRQF_EARLY_RESUME, - name, NULL); + xevt->name, NULL); (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); memcpy(evt, xen_clockevent, sizeof(*evt)); evt->cpumask = cpumask_of(cpu); evt->irq = irq; - per_cpu(xen_clock_events, cpu).name = name; } void xen_setup_cpu_clockevents(void) { - BUG_ON(preemptible()); - clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt)); } |