summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/boot/compressed/aslr.c9
-rw-r--r--arch/x86/boot/header.S26
-rw-r--r--arch/x86/boot/tools/build.c38
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c2
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/irqflags.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h4
-rw-r--r--arch/x86/include/asm/ptrace.h16
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c18
-rw-r--r--arch/x86/kernel/apm_32.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c22
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c10
-rw-r--r--arch/x86/kernel/cpu/perf_event.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event.h12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c78
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c11
-rw-r--r--arch/x86/kernel/entry_32.S13
-rw-r--r--arch/x86/kernel/entry_64.S28
-rw-r--r--arch/x86/kernel/espfix_64.c5
-rw-r--r--arch/x86/kernel/kprobes/core.c3
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/traps.c7
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--arch/x86/kvm/svm.c1
-rw-r--r--arch/x86/kvm/x86.c14
-rw-r--r--arch/x86/vdso/Makefile24
-rw-r--r--arch/x86/vdso/vclock_gettime.c3
-rw-r--r--arch/x86/vdso/vdso-fakesections.c41
-rw-r--r--arch/x86/vdso/vdso-layout.lds.S64
-rw-r--r--arch/x86/vdso/vdso.lds.S2
-rw-r--r--arch/x86/vdso/vdso2c.c73
-rw-r--r--arch/x86/vdso/vdso2c.h202
-rw-r--r--arch/x86/vdso/vdso32/vdso-fakesections.c1
-rw-r--r--arch/x86/vdso/vdsox32.lds.S2
-rw-r--r--arch/x86/vdso/vma.c4
-rw-r--r--arch/x86/xen/enlighten.c5
-rw-r--r--arch/x86/xen/grant-table.c148
-rw-r--r--arch/x86/xen/setup.c60
-rw-r--r--arch/x86/xen/xen-ops.h1
43 files changed, 691 insertions, 292 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fcefdda5136d..d24887b645dc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -131,6 +131,7 @@ config X86
select HAVE_CC_STACKPROTECTOR
select GENERIC_CPU_AUTOPROBE
select HAVE_ARCH_AUDITSYSCALL
+ select ARCH_SUPPORTS_ATOMIC_RMW
config INSTRUCTION_DECODER
def_bool y
@@ -1672,7 +1673,6 @@ config RELOCATABLE
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image"
depends on RELOCATABLE
- depends on !HIBERNATION
default n
---help---
Randomizes the physical and virtual address at which the
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 4dbf967da50d..fc6091abedb7 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -289,10 +289,17 @@ unsigned char *choose_kernel_location(unsigned char *input,
unsigned long choice = (unsigned long)output;
unsigned long random;
+#ifdef CONFIG_HIBERNATION
+ if (!cmdline_find_option_bool("kaslr")) {
+ debug_putstr("KASLR disabled by default...\n");
+ goto out;
+ }
+#else
if (cmdline_find_option_bool("nokaslr")) {
- debug_putstr("KASLR disabled...\n");
+ debug_putstr("KASLR disabled by cmdline...\n");
goto out;
}
+#endif
/* Record the various known unsafe memory ranges. */
mem_avoid_init((unsigned long)input, input_size,
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 84c223479e3c..7a6d43a554d7 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -91,10 +91,9 @@ bs_die:
.section ".bsdata", "a"
bugger_off_msg:
- .ascii "Direct floppy boot is not supported. "
- .ascii "Use a boot loader program instead.\r\n"
+ .ascii "Use a boot loader.\r\n"
.ascii "\n"
- .ascii "Remove disk and press any key to reboot ...\r\n"
+ .ascii "Remove disk and press any key to reboot...\r\n"
.byte 0
#ifdef CONFIG_EFI_STUB
@@ -108,7 +107,7 @@ coff_header:
#else
.word 0x8664 # x86-64
#endif
- .word 3 # nr_sections
+ .word 4 # nr_sections
.long 0 # TimeDateStamp
.long 0 # PointerToSymbolTable
.long 1 # NumberOfSymbols
@@ -250,6 +249,25 @@ section_table:
.word 0 # NumberOfLineNumbers
.long 0x60500020 # Characteristics (section flags)
+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".bss"
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0xc8000080 # Characteristics (section flags)
+
#endif /* CONFIG_EFI_STUB */
# Kernel attributes; used by setup. This is part 1 of the
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 1a2f2121cada..a7661c430cd9 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -143,7 +143,7 @@ static void usage(void)
#ifdef CONFIG_EFI_STUB
-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
{
unsigned int pe_header;
unsigned short num_sections;
@@ -164,10 +164,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
put_unaligned_le32(size, section + 0x8);
/* section header vma field */
- put_unaligned_le32(offset, section + 0xc);
+ put_unaligned_le32(vma, section + 0xc);
/* section header 'size of initialised data' field */
- put_unaligned_le32(size, section + 0x10);
+ put_unaligned_le32(datasz, section + 0x10);
/* section header 'file offset' field */
put_unaligned_le32(offset, section + 0x14);
@@ -179,6 +179,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
}
}
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+ update_pecoff_section_header_fields(section_name, offset, size, size, offset);
+}
+
static void update_pecoff_setup_and_reloc(unsigned int size)
{
u32 setup_offset = 0x200;
@@ -203,9 +208,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
pe_header = get_unaligned_le32(&buf[0x3c]);
- /* Size of image */
- put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
/*
* Size of code: Subtract the size of the first sector (512 bytes)
* which includes the header.
@@ -220,6 +222,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
update_pecoff_section_header(".text", text_start, text_sz);
}
+static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz)
+{
+ unsigned int pe_header;
+ unsigned int bss_sz = init_sz - file_sz;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+
+ /* Size of uninitialized data */
+ put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]);
+
+ /* Size of image */
+ put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
+
+ update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0);
+}
+
static int reserve_pecoff_reloc_section(int c)
{
/* Reserve 0x20 bytes for .reloc section */
@@ -259,6 +277,8 @@ static void efi_stub_entry_update(void)
static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
static inline void update_pecoff_text(unsigned int text_start,
unsigned int file_sz) {}
+static inline void update_pecoff_bss(unsigned int file_sz,
+ unsigned int init_sz) {}
static inline void efi_stub_defaults(void) {}
static inline void efi_stub_entry_update(void) {}
@@ -310,7 +330,7 @@ static void parse_zoffset(char *fname)
int main(int argc, char ** argv)
{
- unsigned int i, sz, setup_sectors;
+ unsigned int i, sz, setup_sectors, init_sz;
int c;
u32 sys_size;
struct stat sb;
@@ -376,7 +396,9 @@ int main(int argc, char ** argv)
buf[0x1f1] = setup_sectors-1;
put_unaligned_le32(sys_size, &buf[0x1f4]);
- update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+ update_pecoff_text(setup_sectors * 512, i + (sys_size * 16));
+ init_sz = get_unaligned_le32(&buf[0x260]);
+ update_pecoff_bss(i + (sys_size * 16), init_sz);
efi_stub_entry_update();
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index f30cd10293f0..8626b03e83b7 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
/* save number of bits */
bits[1] = cpu_to_be64(sctx->count[0] << 3);
- bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61;
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
/* Pad out to 112 mod 128 and append length */
index = sctx->count[0] & 0x7f;
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index cb6cfcd034cf..a80cbb88ea91 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
extern void init_ISA_irqs(void);
#ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(void);
+void arch_trigger_all_cpu_backtrace(bool);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf88e624..0a8b519226b8 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void)
#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
-#define INTERRUPT_RETURN iretq
+#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
sysretq;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 49314155b66c..49205d01b9ad 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -95,7 +95,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 80
#define KVM_NR_FIXED_MTRR_REGION 88
-#define KVM_NR_VAR_MTRR 8
+#define KVM_NR_VAR_MTRR 10
#define ASYNC_PF_PER_VCPU 64
@@ -461,7 +461,7 @@ struct kvm_vcpu_arch {
bool nmi_injected; /* Trying to inject an NMI this entry */
struct mtrr_state_type mtrr_state;
- u32 pat;
+ u64 pat;
unsigned switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14fd6fd75a19..6205f0c434db 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -231,6 +231,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
#define ARCH_HAS_USER_SINGLE_STEP_INFO
+/*
+ * When hitting ptrace_stop(), we cannot return using SYSRET because
+ * that does not restore the full CPU state, only a minimal set. The
+ * ptracer can change arbitrary register values, which is usually okay
+ * because the usual ptrace stops run off the signal delivery path which
+ * forces IRET; however, ptrace_event() stops happen in arbitrary places
+ * in the kernel and don't force IRET path.
+ *
+ * So force IRET path after a ptrace stop.
+ */
+#define arch_ptrace_stop_needed(code, info) \
+({ \
+ set_thread_flag(TIF_NOTIFY_RESUME); \
+ false; \
+})
+
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index c3fcb5de5083..6a1e71bde323 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
{
int i;
+ int cpu = get_cpu();
- if (test_and_set_bit(0, &backtrace_flag))
+ if (test_and_set_bit(0, &backtrace_flag)) {
/*
* If there is already a trigger_all_cpu_backtrace() in progress
* (backtrace_flag == 1), don't output double cpu dump infos.
*/
+ put_cpu();
return;
+ }
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+ if (!include_self)
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
+ if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+ pr_info("sending NMI to %s CPUs:\n",
+ (include_self ? "all" : "other"));
+ apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+ }
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
+ touch_softlockup_watchdog();
}
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
+ put_cpu();
}
static int
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f3a1f04ed4cb..584874451414 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -841,7 +841,6 @@ static int apm_do_idle(void)
u32 eax;
u8 ret = 0;
int idled = 0;
- int polling;
int err = 0;
if (!need_resched()) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a80029035bf2..f9e4fdd3b877 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -370,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c)
*/
detect_extended_topology(c);
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+ * detection.
+ */
+ c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+ }
+
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -438,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
-
/* Work around errata */
srat_detect_node(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a952e9c85b6f..9c8f7394c612 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
#endif
}
+#ifdef CONFIG_X86_HT
+ /*
+ * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+ * turns means that the only possibility is SMT (as indicated in
+ * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+ * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+ * c->phys_proc_id.
+ */
+ if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
return l2;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bb92f38153b2..9a79c8dbd8e8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2451,6 +2451,12 @@ static __init int mcheck_init_device(void)
for_each_online_cpu(i) {
err = mce_device_create(i);
if (err) {
+ /*
+ * Register notifier anyway (and do not unreg it) so
+ * that we don't leave undeleted timers, see notifier
+ * callback above.
+ */
+ __register_hotcpu_notifier(&mce_cpu_notifier);
cpu_notifier_register_done();
goto err_device_create;
}
@@ -2471,10 +2477,6 @@ static __init int mcheck_init_device(void)
err_register:
unregister_syscore_ops(&mce_syscore_ops);
- cpu_notifier_register_begin();
- __unregister_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
-
err_device_create:
/*
* We didn't keep track of which devices were created above, but
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bdfbff8a4f6..2879ecdaac43 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
+ /* Check if the extra msrs can be safely accessed*/
+ if (!er->extra_msr_access)
+ return -ENXIO;
reg->idx = er->idx;
reg->config = event->attr.config1;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3b2f9bdd974b..8ade93111e03 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -295,14 +295,16 @@ struct extra_reg {
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
- .event = (e), \
- .msr = (ms), \
- .config_mask = (m), \
- .valid_mask = (vm), \
- .idx = EXTRA_REG_##i, \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index adb02aa62af5..2502d0d9d246 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1382,6 +1382,15 @@ again:
intel_pmu_lbr_read();
/*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -2173,6 +2182,41 @@ static void intel_snb_check_microcode(void)
}
}
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ if (val_new != val_tmp)
+ return false;
+
+ /* Here it's sure that the MSR can be safely accessed.
+ * Restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
static __init void intel_sandybridge_quirk(void)
{
x86_pmu.check_microcode = intel_snb_check_microcode;
@@ -2262,7 +2306,8 @@ __init int intel_pmu_init(void)
union cpuid10_ebx ebx;
struct event_constraint *c;
unsigned int unused;
- int version;
+ struct extra_reg *er;
+ int version, i;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -2465,6 +2510,9 @@ __init int intel_pmu_init(void)
case 62: /* IvyBridge EP */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
sizeof(hw_cache_extra_regs));
@@ -2565,6 +2613,34 @@ __init int intel_pmu_init(void)
}
}
+ /*
+ * Access LBR MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support LBR MSR
+ * Check all LBT MSR here.
+ * Disable LBR access if any LBR MSRs can not be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ /*
+ * Access extra MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support offcore event
+ * Check all extra_regs here.
+ */
+ if (x86_pmu.extra_regs) {
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+ }
+
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 980970cb744d..696ade311ded 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -311,9 +311,11 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
- if (unlikely(!buffer))
+ buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
+ }
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
thresh = max / 16;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 65bbbea38b9c..ae6552a0701f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -550,16 +550,16 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
@@ -1222,6 +1222,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+
SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
@@ -1245,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0da82b8e634..0d0c9d4ab6d5 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -423,8 +423,9 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
+sysenter_after_call:
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -501,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -674,8 +676,13 @@ syscall_fault:
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b25ca969edd2..c844f0816ab8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -830,27 +830,24 @@ restore_args:
RESTORE_ARGS 1,8,1
irq_return:
+ INTERRUPT_RETURN
+
+ENTRY(native_iret)
/*
* Are we returning to a stack segment from the LDT? Note: in
* 64-bit mode SS:RSP on the exception stack is always valid.
*/
#ifdef CONFIG_X86_ESPFIX64
testb $4,(SS-RIP)(%rsp)
- jnz irq_return_ldt
+ jnz native_irq_return_ldt
#endif
-irq_return_iret:
- INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return_iret, bad_iret)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
+native_irq_return_iret:
iretq
- _ASM_EXTABLE(native_iret, bad_iret)
-#endif
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)
#ifdef CONFIG_X86_ESPFIX64
-irq_return_ldt:
+native_irq_return_ldt:
pushq_cfi %rax
pushq_cfi %rdi
SWAPGS
@@ -872,7 +869,7 @@ irq_return_ldt:
SWAPGS
movq %rax,%rsp
popq_cfi %rax
- jmp irq_return_iret
+ jmp native_irq_return_iret
#endif
.section .fixup,"ax"
@@ -956,13 +953,8 @@ __do_double_fault:
cmpl $__KERNEL_CS,CS(%rdi)
jne do_double_fault
movq RIP(%rdi),%rax
- cmpq $irq_return_iret,%rax
-#ifdef CONFIG_PARAVIRT
- je 1f
- cmpq $native_iret,%rax
-#endif
+ cmpq $native_irq_return_iret,%rax
jne do_double_fault /* This shouldn't happen... */
-1:
movq PER_CPU_VAR(kernel_stack),%rax
subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
movq %rax,RSP(%rdi)
@@ -1428,7 +1420,7 @@ error_sti:
*/
error_kernelspace:
incl %ebx
- leaq irq_return_iret(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6afbb16e9b79..94d857fb1033 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -175,7 +175,7 @@ void init_espfix_ap(void)
if (!pud_present(pud)) {
pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
- paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
set_pud(&pud_p[n], pud);
}
@@ -185,7 +185,7 @@ void init_espfix_ap(void)
if (!pmd_present(pmd)) {
pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
- paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
set_pmd(&pmd_p[n], pmd);
}
@@ -193,7 +193,6 @@ void init_espfix_ap(void)
pte_p = pte_offset_kernel(&pmd, addr);
stack_page = (void *)__get_free_page(GFP_KERNEL);
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
- paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 7596df664901..67e6d19ef1be 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -574,6 +574,9 @@ int kprobe_int3_handler(struct pt_regs *regs)
struct kprobe *p;
struct kprobe_ctlblk *kcb;
+ if (user_mode_vm(regs))
+ return 0;
+
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
* We don't want to be preempted for the entire
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93eb..a1da6737ba5b 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index a0da58db43a8..2851d63c1202 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
/* Set up to return from userspace. */
restorer = current->mm->context.vdso +
- selected_vdso32->sym___kernel_sigreturn;
+ selected_vdso32->sym___kernel_rt_sigreturn;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c6eb418c5627..0d0e922fafc1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -343,6 +343,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
if (poke_int3_handler(regs))
return;
+ prev_state = exception_enter();
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@@ -351,9 +352,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
#ifdef CONFIG_KPROBES
if (kprobe_int3_handler(regs))
- return;
+ goto exit;
#endif
- prev_state = exception_enter();
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@@ -433,6 +433,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6;
int si_code;
+ prev_state = exception_enter();
+
get_debugreg(dr6, 6);
/* Filter out all the reserved bits which are preset to 1 */
@@ -465,7 +467,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
if (kprobe_debug_handler(regs))
goto exit;
#endif
- prev_state = exception_enter();
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57e5ce126d5a..ea030319b321 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -920,9 +920,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu);
+ }
return 0;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ec8366c5cfea..b5e994ad0135 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1462,6 +1462,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
*/
if (var->unusable)
var->db = 0;
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
break;
}
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f32a02578c0d..ef432f891d30 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1898,7 +1898,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
break;
gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(kvm, data,
+ if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
&tsc_ref, sizeof(tsc_ref)))
return 1;
mark_page_dirty(kvm, gfn);
@@ -5887,6 +5887,18 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_x86_ops->set_nmi(vcpu);
}
} else if (kvm_cpu_has_injectable_intr(vcpu)) {
+ /*
+ * Because interrupts can be injected asynchronously, we are
+ * calling check_nested_events again here to avoid a race condition.
+ * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+ * proposal and current concerns. Perhaps we should be setting
+ * KVM_REQ_EVENT only on certain events and not unconditionally?
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
+ }
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 3c0809a0631f..61b04fe36e66 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -11,7 +11,6 @@ VDSO32-$(CONFIG_COMPAT) := y
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vdso-fakesections.o
-vobjs-nox32 := vdso-fakesections.o
# files to link into kernel
obj-y += vma.o
@@ -67,7 +66,8 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
- -fno-omit-frame-pointer -foptimize-sibling-calls
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -DDISABLE_BRANCH_PROFILING
$(vobjs): KBUILD_CFLAGS += $(CFL)
@@ -134,7 +134,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
-targets += vdso32/vclock_gettime.o
+targets += vdso32/vclock_gettime.o vdso32/vdso-fakesections.o
$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
@@ -150,11 +150,13 @@ KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
+KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
$(obj)/vdso32/vdso32.lds \
$(obj)/vdso32/vclock_gettime.o \
+ $(obj)/vdso32/vdso-fakesections.o \
$(obj)/vdso32/note.o \
$(obj)/vdso32/%.o
$(call if_changed,vdso)
@@ -169,14 +171,24 @@ quiet_cmd_vdso = VDSO $@
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
- -Wl,-Bsymbolic $(LTO_CFLAGS)
+ $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
GCOV_PROFILE := n
#
-# Install the unstripped copies of vdso*.so.
+# Install the unstripped copies of vdso*.so. If our toolchain supports
+# build-id, install .build-id links as well.
#
quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
- cmd_vdso_install = cp $< $(MODLIB)/vdso/$(@:install_%=%)
+define cmd_vdso_install
+ cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
+ if readelf -n $< |grep -q 'Build ID'; then \
+ buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
+ first=`echo $$buildid | cut -b-2`; \
+ last=`echo $$buildid | cut -b3-`; \
+ mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
+ ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
+ fi
+endef
vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index b2e4f493e5b0..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -11,9 +11,6 @@
* Check with readelf after changing.
*/
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
-
#include <uapi/linux/time.h>
#include <asm/vgtod.h>
#include <asm/hpet.h>
diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c
index cb8a8d72c24b..aa5fbfab20a5 100644
--- a/arch/x86/vdso/vdso-fakesections.c
+++ b/arch/x86/vdso/vdso-fakesections.c
@@ -2,31 +2,20 @@
* Copyright 2014 Andy Lutomirski
* Subject to the GNU Public License, v.2
*
- * Hack to keep broken Go programs working.
- *
- * The Go runtime had a couple of bugs: it would read the section table to try
- * to figure out how many dynamic symbols there were (it shouldn't have looked
- * at the section table at all) and, if there were no SHT_SYNDYM section table
- * entry, it would use an uninitialized value for the number of symbols. As a
- * workaround, we supply a minimal section table. vdso2c will adjust the
- * in-memory image so that "vdso_fake_sections" becomes the section table.
- *
- * The bug was introduced by:
- * https://code.google.com/p/go/source/detail?r=56ea40aac72b (2012-08-31)
- * and is being addressed in the Go runtime in this issue:
- * https://code.google.com/p/go/issues/detail?id=8197
+ * String table for loadable section headers. See vdso2c.h for why
+ * this exists.
*/
-#ifndef __x86_64__
-#error This hack is specific to the 64-bit vDSO
-#endif
-
-#include <linux/elf.h>
-
-extern const __visible struct elf64_shdr vdso_fake_sections[];
-const __visible struct elf64_shdr vdso_fake_sections[] = {
- {
- .sh_type = SHT_DYNSYM,
- .sh_entsize = sizeof(Elf64_Sym),
- }
-};
+const char fake_shstrtab[] __attribute__((section(".fake_shstrtab"))) =
+ ".hash\0"
+ ".dynsym\0"
+ ".dynstr\0"
+ ".gnu.version\0"
+ ".gnu.version_d\0"
+ ".dynamic\0"
+ ".rodata\0"
+ ".fake_shstrtab\0" /* Yay, self-referential code. */
+ ".note\0"
+ ".eh_frame_hdr\0"
+ ".eh_frame\0"
+ ".text";
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
index 2ec72f651ebf..9197544eea9a 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -6,6 +6,16 @@
* This script controls its layout.
*/
+#if defined(BUILD_VDSO64)
+# define SHDR_SIZE 64
+#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32)
+# define SHDR_SIZE 40
+#else
+# error unknown VDSO target
+#endif
+
+#define NUM_FAKE_SHDRS 13
+
SECTIONS
{
. = SIZEOF_HEADERS;
@@ -18,36 +28,53 @@ SECTIONS
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : {
+ *(.rodata*)
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+
+ /*
+ * Ideally this would live in a C file, but that won't
+ * work cleanly for x32 until we start building the x32
+ * C code using an x32 toolchain.
+ */
+ VDSO_FAKE_SECTION_TABLE_START = .;
+ . = . + NUM_FAKE_SHDRS * SHDR_SIZE;
+ VDSO_FAKE_SECTION_TABLE_END = .;
+ } :text
+
+ .fake_shstrtab : { *(.fake_shstrtab) } :text
+
+
.note : { *(.note.*) } :text :note
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
- .dynamic : { *(.dynamic) } :text :dynamic
-
- .rodata : { *(.rodata*) } :text
- .data : {
- *(.data*)
- *(.sdata*)
- *(.got.plt) *(.got)
- *(.gnu.linkonce.d.*)
- *(.bss*)
- *(.dynbss*)
- *(.gnu.linkonce.b.*)
- }
-
- .altinstructions : { *(.altinstructions) }
- .altinstr_replacement : { *(.altinstr_replacement) }
/*
- * Align the actual code well away from the non-instruction data.
- * This is the best thing for the I-cache.
+ * Text is well-separated from actual data: there's plenty of
+ * stuff that isn't used at runtime in between.
*/
- . = ALIGN(0x100);
.text : { *(.text*) } :text =0x90909090,
/*
+ * At the end so that eu-elflint stays happy when vdso2c strips
+ * these. A better implementation would avoid allocating space
+ * for these.
+ */
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
+
+ /*
* The remainder of the vDSO consists of special pages that are
* shared between the kernel and userspace. It needs to be at the
* end so that it doesn't overlap the mapping of the actual
@@ -75,6 +102,7 @@ SECTIONS
/DISCARD/ : {
*(.discard)
*(.discard.*)
+ *(__bug_table)
}
}
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 75e3404c83b1..6807932643c2 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -6,6 +6,8 @@
* the DSO.
*/
+#define BUILD_VDSO64
+
#include "vdso-layout.lds.S"
/*
diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c
index 7a6bf50f9165..238dbe82776e 100644
--- a/arch/x86/vdso/vdso2c.c
+++ b/arch/x86/vdso/vdso2c.c
@@ -23,6 +23,8 @@ enum {
sym_vvar_page,
sym_hpet_page,
sym_end_mapping,
+ sym_VDSO_FAKE_SECTION_TABLE_START,
+ sym_VDSO_FAKE_SECTION_TABLE_END,
};
const int special_pages[] = {
@@ -30,15 +32,26 @@ const int special_pages[] = {
sym_hpet_page,
};
-char const * const required_syms[] = {
- [sym_vvar_page] = "vvar_page",
- [sym_hpet_page] = "hpet_page",
- [sym_end_mapping] = "end_mapping",
- "VDSO32_NOTE_MASK",
- "VDSO32_SYSENTER_RETURN",
- "__kernel_vsyscall",
- "__kernel_sigreturn",
- "__kernel_rt_sigreturn",
+struct vdso_sym {
+ const char *name;
+ bool export;
+};
+
+struct vdso_sym required_syms[] = {
+ [sym_vvar_page] = {"vvar_page", true},
+ [sym_hpet_page] = {"hpet_page", true},
+ [sym_end_mapping] = {"end_mapping", true},
+ [sym_VDSO_FAKE_SECTION_TABLE_START] = {
+ "VDSO_FAKE_SECTION_TABLE_START", false
+ },
+ [sym_VDSO_FAKE_SECTION_TABLE_END] = {
+ "VDSO_FAKE_SECTION_TABLE_END", false
+ },
+ {"VDSO32_NOTE_MASK", true},
+ {"VDSO32_SYSENTER_RETURN", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
@@ -83,37 +96,21 @@ extern void bad_put_le(void);
#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0]))
-#define BITS 64
-#define GOFUNC go64
-#define Elf_Ehdr Elf64_Ehdr
-#define Elf_Shdr Elf64_Shdr
-#define Elf_Phdr Elf64_Phdr
-#define Elf_Sym Elf64_Sym
-#define Elf_Dyn Elf64_Dyn
+#define BITSFUNC3(name, bits) name##bits
+#define BITSFUNC2(name, bits) BITSFUNC3(name, bits)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS)
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+#define ELF_BITS 64
#include "vdso2c.h"
-#undef BITS
-#undef GOFUNC
-#undef Elf_Ehdr
-#undef Elf_Shdr
-#undef Elf_Phdr
-#undef Elf_Sym
-#undef Elf_Dyn
-
-#define BITS 32
-#define GOFUNC go32
-#define Elf_Ehdr Elf32_Ehdr
-#define Elf_Shdr Elf32_Shdr
-#define Elf_Phdr Elf32_Phdr
-#define Elf_Sym Elf32_Sym
-#define Elf_Dyn Elf32_Dyn
+#undef ELF_BITS
+
+#define ELF_BITS 32
#include "vdso2c.h"
-#undef BITS
-#undef GOFUNC
-#undef Elf_Ehdr
-#undef Elf_Shdr
-#undef Elf_Phdr
-#undef Elf_Sym
-#undef Elf_Dyn
+#undef ELF_BITS
static void go(void *addr, size_t len, FILE *outfile, const char *name)
{
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h
index c6eefaf389b9..11b65d4f9414 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/vdso/vdso2c.h
@@ -4,23 +4,139 @@
* are built for 32-bit userspace.
*/
-static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
+/*
+ * We're writing a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * were no SHT_SYNDYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+struct BITSFUNC(fake_sections)
+{
+ ELF(Shdr) *table;
+ unsigned long table_offset;
+ int count, max_count;
+
+ int in_shstrndx;
+ unsigned long shstr_offset;
+ const char *shstrtab;
+ size_t shstrtab_len;
+
+ int out_shstrndx;
+};
+
+static unsigned int BITSFUNC(find_shname)(struct BITSFUNC(fake_sections) *out,
+ const char *name)
+{
+ const char *outname = out->shstrtab;
+ while (outname - out->shstrtab < out->shstrtab_len) {
+ if (!strcmp(name, outname))
+ return (outname - out->shstrtab) + out->shstr_offset;
+ outname += strlen(outname) + 1;
+ }
+
+ if (*name)
+ printf("Warning: could not find output name \"%s\"\n", name);
+ return out->shstr_offset + out->shstrtab_len - 1; /* Use a null. */
+}
+
+static void BITSFUNC(init_sections)(struct BITSFUNC(fake_sections) *out)
+{
+ if (!out->in_shstrndx)
+ fail("didn't find the fake shstrndx\n");
+
+ memset(out->table, 0, out->max_count * sizeof(ELF(Shdr)));
+
+ if (out->max_count < 1)
+ fail("we need at least two fake output sections\n");
+
+ PUT_LE(&out->table[0].sh_type, SHT_NULL);
+ PUT_LE(&out->table[0].sh_name, BITSFUNC(find_shname)(out, ""));
+
+ out->count = 1;
+}
+
+static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out,
+ int in_idx, const ELF(Shdr) *in,
+ const char *name)
+{
+ uint64_t flags = GET_LE(&in->sh_flags);
+
+ bool copy = flags & SHF_ALLOC &&
+ (GET_LE(&in->sh_size) ||
+ (GET_LE(&in->sh_type) != SHT_RELA &&
+ GET_LE(&in->sh_type) != SHT_REL)) &&
+ strcmp(name, ".altinstructions") &&
+ strcmp(name, ".altinstr_replacement");
+
+ if (!copy)
+ return;
+
+ if (out->count >= out->max_count)
+ fail("too many copied sections (max = %d)\n", out->max_count);
+
+ if (in_idx == out->in_shstrndx)
+ out->out_shstrndx = out->count;
+
+ out->table[out->count] = *in;
+ PUT_LE(&out->table[out->count].sh_name,
+ BITSFUNC(find_shname)(out, name));
+
+ /* elfutils requires that a strtab have the correct type. */
+ if (!strcmp(name, ".fake_shstrtab"))
+ PUT_LE(&out->table[out->count].sh_type, SHT_STRTAB);
+
+ out->count++;
+}
+
+static void BITSFUNC(go)(void *addr, size_t len,
+ FILE *outfile, const char *name)
{
int found_load = 0;
unsigned long load_size = -1; /* Work around bogus warning */
unsigned long data_size;
- Elf_Ehdr *hdr = (Elf_Ehdr *)addr;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)addr;
int i;
unsigned long j;
- Elf_Shdr *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
+ ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
*alt_sec = NULL;
- Elf_Dyn *dyn = 0, *dyn_end = 0;
+ ELF(Dyn) *dyn = 0, *dyn_end = 0;
const char *secstrings;
uint64_t syms[NSYMS] = {};
- uint64_t fake_sections_value = 0, fake_sections_size = 0;
+ struct BITSFUNC(fake_sections) fake_sections = {};
- Elf_Phdr *pt = (Elf_Phdr *)(addr + GET_LE(&hdr->e_phoff));
+ ELF(Phdr) *pt = (ELF(Phdr) *)(addr + GET_LE(&hdr->e_phoff));
/* Walk the segment table. */
for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
@@ -51,7 +167,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
for (i = 0; dyn + i < dyn_end &&
GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
- if (tag == DT_REL || tag == DT_RELSZ ||
+ if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
tag == DT_RELENT || tag == DT_TEXTREL)
fail("vdso image contains dynamic relocations\n");
}
@@ -61,7 +177,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
secstrings = addr + GET_LE(&secstrings_hdr->sh_offset);
for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
- Elf_Shdr *sh = addr + GET_LE(&hdr->e_shoff) +
+ ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * i;
if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
symtab_hdr = sh;
@@ -82,29 +198,63 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
i++) {
int k;
- Elf_Sym *sym = addr + GET_LE(&symtab_hdr->sh_offset) +
+ ELF(Sym) *sym = addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
const char *name = addr + GET_LE(&strtab_hdr->sh_offset) +
GET_LE(&sym->st_name);
for (k = 0; k < NSYMS; k++) {
- if (!strcmp(name, required_syms[k])) {
+ if (!strcmp(name, required_syms[k].name)) {
if (syms[k]) {
fail("duplicate symbol %s\n",
- required_syms[k]);
+ required_syms[k].name);
}
syms[k] = GET_LE(&sym->st_value);
}
}
- if (!strcmp(name, "vdso_fake_sections")) {
- if (fake_sections_value)
- fail("duplicate vdso_fake_sections\n");
- fake_sections_value = GET_LE(&sym->st_value);
- fake_sections_size = GET_LE(&sym->st_size);
+ if (!strcmp(name, "fake_shstrtab")) {
+ ELF(Shdr) *sh;
+
+ fake_sections.in_shstrndx = GET_LE(&sym->st_shndx);
+ fake_sections.shstrtab = addr + GET_LE(&sym->st_value);
+ fake_sections.shstrtab_len = GET_LE(&sym->st_size);
+ sh = addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) *
+ fake_sections.in_shstrndx;
+ fake_sections.shstr_offset = GET_LE(&sym->st_value) -
+ GET_LE(&sh->sh_addr);
}
}
+ /* Build the output section table. */
+ if (!syms[sym_VDSO_FAKE_SECTION_TABLE_START] ||
+ !syms[sym_VDSO_FAKE_SECTION_TABLE_END])
+ fail("couldn't find fake section table\n");
+ if ((syms[sym_VDSO_FAKE_SECTION_TABLE_END] -
+ syms[sym_VDSO_FAKE_SECTION_TABLE_START]) % sizeof(ELF(Shdr)))
+ fail("fake section table size isn't a multiple of sizeof(Shdr)\n");
+ fake_sections.table = addr + syms[sym_VDSO_FAKE_SECTION_TABLE_START];
+ fake_sections.table_offset = syms[sym_VDSO_FAKE_SECTION_TABLE_START];
+ fake_sections.max_count = (syms[sym_VDSO_FAKE_SECTION_TABLE_END] -
+ syms[sym_VDSO_FAKE_SECTION_TABLE_START]) /
+ sizeof(ELF(Shdr));
+
+ BITSFUNC(init_sections)(&fake_sections);
+ for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
+ ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * i;
+ BITSFUNC(copy_section)(&fake_sections, i, sh,
+ secstrings + GET_LE(&sh->sh_name));
+ }
+ if (!fake_sections.out_shstrndx)
+ fail("didn't generate shstrndx?!?\n");
+
+ PUT_LE(&hdr->e_shoff, fake_sections.table_offset);
+ PUT_LE(&hdr->e_shentsize, sizeof(ELF(Shdr)));
+ PUT_LE(&hdr->e_shnum, fake_sections.count);
+ PUT_LE(&hdr->e_shstrndx, fake_sections.out_shstrndx);
+
/* Validate mapping addresses. */
for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
if (!syms[i])
@@ -112,25 +262,17 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
if (syms[i] % 4096)
fail("%s must be a multiple of 4096\n",
- required_syms[i]);
+ required_syms[i].name);
if (syms[i] < data_size)
fail("%s must be after the text mapping\n",
- required_syms[i]);
+ required_syms[i].name);
if (syms[sym_end_mapping] < syms[i] + 4096)
- fail("%s overruns end_mapping\n", required_syms[i]);
+ fail("%s overruns end_mapping\n",
+ required_syms[i].name);
}
if (syms[sym_end_mapping] % 4096)
fail("end_mapping must be a multiple of 4096\n");
- /* Remove sections or use fakes */
- if (fake_sections_size % sizeof(Elf_Shdr))
- fail("vdso_fake_sections size is not a multiple of %ld\n",
- (long)sizeof(Elf_Shdr));
- PUT_LE(&hdr->e_shoff, fake_sections_value);
- PUT_LE(&hdr->e_shentsize, fake_sections_value ? sizeof(Elf_Shdr) : 0);
- PUT_LE(&hdr->e_shnum, fake_sections_size / sizeof(Elf_Shdr));
- PUT_LE(&hdr->e_shstrndx, SHN_UNDEF);
-
if (!name) {
fwrite(addr, load_size, 1, outfile);
return;
@@ -168,9 +310,9 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
(unsigned long)GET_LE(&alt_sec->sh_size));
}
for (i = 0; i < NSYMS; i++) {
- if (syms[i])
+ if (required_syms[i].export && syms[i])
fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n",
- required_syms[i], syms[i]);
+ required_syms[i].name, syms[i]);
}
fprintf(outfile, "};\n");
}
diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/vdso/vdso32/vdso-fakesections.c
new file mode 100644
index 000000000000..541468e25265
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vdso-fakesections.c
@@ -0,0 +1 @@
+#include "../vdso-fakesections.c"
diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S
index 46b991b578a8..697c11ece90c 100644
--- a/arch/x86/vdso/vdsox32.lds.S
+++ b/arch/x86/vdso/vdsox32.lds.S
@@ -6,6 +6,8 @@
* the DSO.
*/
+#define BUILD_VDSOX32
+
#include "vdso-layout.lds.S"
/*
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index e1513c47872a..5a5176de8d0a 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -62,6 +62,9 @@ struct linux_binprm;
Only used for the 64-bit and x32 vdsos. */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
+#ifdef CONFIG_X86_32
+ return 0;
+#else
unsigned long addr, end;
unsigned offset;
end = (start + PMD_SIZE - 1) & PMD_MASK;
@@ -83,6 +86,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
addr = align_vdso_addr(addr);
return addr;
+#endif
}
static int map_vdso(const struct vdso_image *image, bool calculate_addr)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f17b29210ac4..ffb101e45731 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1537,7 +1537,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
if (!xen_pvh_domain())
pv_cpu_ops = xen_cpu_ops;
- x86_init.resources.memory_setup = xen_memory_setup;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
+ else
+ x86_init.resources.memory_setup = xen_memory_setup;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index c98583588580..ebfa9b2c871d 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -36,99 +36,133 @@
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
+#include <xen/xen.h>
#include <asm/pgtable.h>
-static int map_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+static struct gnttab_vm_area {
+ struct vm_struct *area;
+ pte_t **ptes;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ void **__shared)
{
- unsigned long **frames = (unsigned long **)data;
+ void *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
- return 0;
-}
+ if (shared == NULL)
+ *__shared = shared = gnttab_shared_vm_area.area->addr;
-/*
- * This function is used to map shared frames to store grant status. It is
- * different from map_pte_fn above, the frames type here is uint64_t.
- */
-static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
-{
- uint64_t **frames = (uint64_t **)data;
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
return 0;
}
-static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
{
+ grant_status_t *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_status_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
- set_pte_at(&init_mm, addr, pte, __pte(0));
return 0;
}
-int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- void **__shared)
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
- int rc;
- void *shared = *__shared;
+ pte_t **ptes;
+ unsigned long addr;
+ unsigned long i;
- if (shared == NULL) {
- struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
- }
+ if (shared == gnttab_status_vm_area.area->addr)
+ ptes = gnttab_status_vm_area.ptes;
+ else
+ ptes = gnttab_shared_vm_area.ptes;
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn, &frames);
- return rc;
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, ptes[i], __pte(0));
+ addr += PAGE_SIZE;
+ }
}
-int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- grant_status_t **__shared)
+static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
{
- int rc;
- grant_status_t *shared = *__shared;
+ area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+ if (area->ptes == NULL)
+ return -ENOMEM;
- if (shared == NULL) {
- /* No need to pass in PTE as we are going to do it
- * in apply_to_page_range anyhow. */
- struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
+ area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
+ if (area->area == NULL) {
+ kfree(area->ptes);
+ return -ENOMEM;
}
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn_status, &frames);
- return rc;
+ return 0;
}
-void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
+{
+ free_vm_area(area->area);
+ kfree(area->ptes);
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
{
- apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
+ int ret;
+
+ if (!xen_pv_domain())
+ return 0;
+
+ ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Always allocate the space for the status frames in case
+ * we're migrated to a host with V2 support.
+ */
+ ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+ err:
+ arch_gnttab_vfree(&gnttab_shared_vm_area);
+ return -ENOMEM;
}
+
#ifdef CONFIG_XEN_PVH
#include <xen/balloon.h>
#include <xen/events.h>
-#include <xen/xen.h>
#include <linux/slab.h>
static int __init xlated_setup_gnttab_pages(void)
{
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 821a11ada590..2e555163c2fe 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,7 +27,6 @@
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
-#include "mmu.h"
#include "xen-ops.h"
#include "vdso.h"
@@ -82,9 +81,6 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
memblock_reserve(start, size);
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
xen_max_p2m_pfn = PFN_DOWN(start + size);
for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
@@ -107,7 +103,6 @@ static unsigned long __init xen_do_chunk(unsigned long start,
.domid = DOMID_SELF
};
unsigned long len = 0;
- int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
unsigned long pfn;
int ret;
@@ -121,7 +116,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
continue;
frame = mfn;
} else {
- if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
+ if (mfn != INVALID_P2M_ENTRY)
continue;
frame = pfn;
}
@@ -159,13 +154,6 @@ static unsigned long __init xen_do_chunk(unsigned long start,
static unsigned long __init xen_release_chunk(unsigned long start,
unsigned long end)
{
- /*
- * Xen already ballooned out the E820 non RAM regions for us
- * and set them up properly in EPT.
- */
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return end - start;
-
return xen_do_chunk(start, end, true);
}
@@ -234,13 +222,7 @@ static void __init xen_set_identity_and_release_chunk(
* (except for the ISA region which must be 1:1 mapped) to
* release the refcounts (in Xen) on the original frames.
*/
-
- /*
- * PVH E820 matches the hypervisor's P2M which means we need to
- * account for the proper values of *release and *identity.
- */
- for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
- pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
pte_t pte = __pte_ma(0);
if (pfn < PFN_UP(ISA_END_ADDRESS))
@@ -518,6 +500,35 @@ char * __init xen_memory_setup(void)
}
/*
+ * Machine specific memory setup for auto-translated guests.
+ */
+char * __init xen_auto_xlated_memory_setup(void)
+{
+ static struct e820entry map[E820MAX] __initdata;
+
+ struct xen_memory_map memmap;
+ int i;
+ int rc;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc < 0)
+ panic("No memory map (%d)\n", rc);
+
+ sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+
+ for (i = 0; i < memmap.nr_entries; i++)
+ e820_add_region(map[i].addr, map[i].size, map[i].type);
+
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
+
+ return "Xen";
+}
+
+/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
@@ -590,13 +601,7 @@ void xen_enable_syscall(void)
}
#endif /* CONFIG_X86_64 */
}
-void xen_enable_nmi(void)
-{
-#ifdef CONFIG_X86_64
- if (register_callback(CALLBACKTYPE_nmi, (char *)nmi))
- BUG();
-#endif
-}
+
void __init xen_pvmmu_arch_setup(void)
{
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -611,7 +616,6 @@ void __init xen_pvmmu_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
- xen_enable_nmi();
}
/* This function is not called for HVM domains */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c834d4b231f0..97d87659f779 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -36,6 +36,7 @@ void xen_mm_unpin_all(void);
void xen_set_pat(u64);
char * __init xen_memory_setup(void);
+char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);