diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-22 17:05:15 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-22 17:05:15 -0700 |
commit | 8e204874db000928e37199c2db82b7eb8966cc3c (patch) | |
tree | eae66035cb761c3c5a79e98b92280b5156bc01ef /arch/x86/kernel | |
parent | 3e0b8df79ddb8955d2cce5e858972a9cfe763384 (diff) | |
parent | aafade242ff24fac3aabf61c7861dfa44a3c2445 (diff) |
Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86-64, vdso: Do not allocate memory for the vDSO
clocksource: Change __ARCH_HAS_CLOCKSOURCE_DATA to a CONFIG option
x86, vdso: Drop now wrong comment
Document the vDSO and add a reference parser
ia64: Replace clocksource.fsys_mmio with generic arch data
x86-64: Move vread_tsc and vread_hpet into the vDSO
clocksource: Replace vread with generic arch data
x86-64: Add --no-undefined to vDSO build
x86-64: Allow alternative patching in the vDSO
x86: Make alternative instruction pointers relative
x86-64: Improve vsyscall emulation CS and RIP handling
x86-64: Emulate legacy vsyscalls
x86-64: Fill unused parts of the vsyscall page with 0xcc
x86-64: Remove vsyscall number 3 (venosys)
x86-64: Map the HPET NX
x86-64: Remove kernel.vsyscall64 sysctl
x86-64: Give vvars their own page
x86-64: Document some of entry_64.S
x86-64: Fix alignment of jiffies variable
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/Makefile | 8 | ||||
-rw-r--r-- | arch/x86/kernel/alternative.c | 23 | ||||
-rw-r--r-- | arch/x86/kernel/entry_64.S | 4 | ||||
-rw-r--r-- | arch/x86/kernel/hpet.c | 11 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/tsc.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/vmlinux.lds.S | 49 | ||||
-rw-r--r-- | arch/x86/kernel/vread_tsc_64.c | 36 | ||||
-rw-r--r-- | arch/x86/kernel/vsyscall_64.c | 310 | ||||
-rw-r--r-- | arch/x86/kernel/vsyscall_emu_64.S | 27 |
10 files changed, 216 insertions, 260 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 11817ff85399..04105574c8e9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,17 +24,12 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) -CFLAGS_vread_tsc_64.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n GCOV_PROFILE_tsc.o := n -GCOV_PROFILE_vread_tsc_64.o := n GCOV_PROFILE_paravirt.o := n -# vread_tsc_64 is hot and should be fully optimized: -CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls - obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o @@ -43,7 +38,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o +obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a81f2d52f869..c63822816249 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -14,7 +14,6 @@ #include <asm/pgtable.h> #include <asm/mce.h> #include <asm/nmi.h> -#include <asm/vsyscall.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -extern char __vsyscall_0; void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -263,6 +261,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; + u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); @@ -276,25 +275,23 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { - u8 *instr = a->instr; + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; -#ifdef CONFIG_X86_64 - /* vsyscall code is not mapped yet. resolve it manually. */ - if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { - instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); - DPRINTK("%s: vsyscall fixup: %p => %p\n", - __func__, a->instr, instr); - } -#endif - memcpy(insnbuf, a->replacement, a->replacementlen); + + memcpy(insnbuf, replacement, a->replacementlen); + + /* 0xe8 is a relative jump; fix the offset. */ if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += a->replacement - a->instr; + *(s32 *)(insnbuf + 1) += replacement - instr; + add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); + text_poke_early(instr, insnbuf, a->instrlen); } } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 37e895a1c74d..e13329d800c8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -9,6 +9,8 @@ /* * entry.S contains the system-call and fault low-level handling routines. * + * Some of this is documented in Documentation/x86/entry_64.txt + * * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. * @@ -1109,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error +zeroentry emulate_vsyscall do_emulate_vsyscall + /* Reload gs selector with exception handling */ /* edi: new selector */ diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0f4b0651cd3f..4aecc54236a9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -72,7 +72,7 @@ static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); #ifdef CONFIG_X86_64 - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); #endif } @@ -739,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs) return (cycle_t)hpet_readl(HPET_COUNTER); } -#ifdef CONFIG_X86_64 -static cycle_t __vsyscall_fn vread_hpet(void) -{ - return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); -} -#endif - static struct clocksource clocksource_hpet = { .name = "hpet", .rating = 250, @@ -754,7 +747,7 @@ static struct clocksource clocksource_hpet = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 - .vread = vread_hpet, + .archdata = { .vclock_mode = VCLOCK_HPET }, #endif }; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b9b67166f9de..fbc097a085ca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -872,6 +872,12 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); #endif +#ifdef CONFIG_X86_64 + BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); + set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); + set_bit(VSYSCALL_EMU_VECTOR, used_vectors); +#endif + /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6cc6922262af..56c633a5db72 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 - .vread = vread_tsc, + .archdata = { .vclock_mode = VCLOCK_TSC }, #endif }; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 89aed99aafce..4aa9c54a9b76 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -161,50 +161,47 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) -#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \ - ADDR(.vsyscall_0) + offset \ - : AT(VLOAD(.vsyscall_var_ ## x)) { \ - *(.vsyscall_var_ ## x) \ - } \ - x = VVIRT(.vsyscall_var_ ## x); . = ALIGN(4096); __vsyscall_0 = .; . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { + .vsyscall : AT(VLOAD(.vsyscall)) { *(.vsyscall_0) - } :user - . = ALIGN(L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { - *(.vsyscall_fn) - } - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + . = 1024; *(.vsyscall_1) - } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { - *(.vsyscall_2) - } - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { - *(.vsyscall_3) - } - -#define __VVAR_KERNEL_LDS -#include <asm/vvar.h> -#undef __VVAR_KERNEL_LDS + . = 2048; + *(.vsyscall_2) - . = __vsyscall_0 + PAGE_SIZE; + . = 4096; /* Pad the whole page. */ + } :user =0xcc + . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); #undef VSYSCALL_ADDR #undef VLOAD_OFFSET #undef VLOAD #undef VVIRT_OFFSET #undef VVIRT + + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = offset; \ + *(.vvar_ ## name) +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR + } :data + + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c deleted file mode 100644 index a81aa9e9894c..000000000000 --- a/arch/x86/kernel/vread_tsc_64.c +++ /dev/null @@ -1,36 +0,0 @@ -/* This code runs in userspace. */ - -#define DISABLE_BRANCH_PROFILING -#include <asm/vgtod.h> - -notrace cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - - last = VVAR(vsyscall_gtod_data).clock.cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3e682184d76c..dda7dff9cef7 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -2,6 +2,8 @@ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * + * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] + * * Thanks to hpa@transmeta.com for some useful hint. * Special thanks to Ingo Molnar for his early experience with * a different vsyscall implementation for Linux/IA32 and for the name. @@ -11,10 +13,9 @@ * vsyscalls. One vsyscall can reserve more than 1 slot to avoid * jumping out of line if necessary. We cannot add more with this * mechanism because older kernels won't return -ENOSYS. - * If we want more than four we need a vDSO. * - * Note: the concept clashes with user mode linux. If you use UML and - * want per guest time just set the kernel.vsyscall64 sysctl to 0. + * Note: the concept clashes with user mode linux. UML users should + * use the vDSO. */ /* Disable profiling for userspace code: */ @@ -32,9 +33,12 @@ #include <linux/cpu.h> #include <linux/smp.h> #include <linux/notifier.h> +#include <linux/syscalls.h> +#include <linux/ratelimit.h> #include <asm/vsyscall.h> #include <asm/pgtable.h> +#include <asm/compat.h> #include <asm/page.h> #include <asm/unistd.h> #include <asm/fixmap.h> @@ -44,16 +48,12 @@ #include <asm/desc.h> #include <asm/topology.h> #include <asm/vgtod.h> - -#define __vsyscall(nr) \ - __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace -#define __syscall_clobber "r11","cx","memory" +#include <asm/traps.h> DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), - .sysctl_enabled = 1, }; void update_vsyscall_tz(void) @@ -72,179 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, unsigned long flags; write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ - vsyscall_gtod_data.clock.vread = clock->vread; - vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; - vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = mult; - vsyscall_gtod_data.clock.shift = clock->shift; - vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = *wtm; - vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = mult; + vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.wall_to_monotonic = *wtm; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -/* RED-PEN may want to readd seq locking, but then the variable should be - * write-once. - */ -static __always_inline void do_get_tz(struct timezone * tz) +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) { - *tz = VVAR(vsyscall_gtod_data).sys_tz; -} + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + struct task_struct *tsk; -static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret; - asm volatile("syscall" - : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) - : __syscall_clobber ); - return ret; -} + if (!show_unhandled_signals || !__ratelimit(&rs)) + return; -static __always_inline long time_syscall(long *t) -{ - long secs; - asm volatile("syscall" - : "=a" (secs) - : "0" (__NR_time),"D" (t) : __syscall_clobber); - return secs; -} + tsk = current; -static __always_inline void do_vgettimeofday(struct timeval * tv) -{ - cycle_t now, base, mask, cycle_delta; - unsigned seq; - unsigned long mult, shift, nsec; - cycle_t (*vread)(void); - do { - seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - - vread = VVAR(vsyscall_gtod_data).clock.vread; - if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled || - !vread)) { - gettimeofday(tv,NULL); - return; - } - - now = vread(); - base = VVAR(vsyscall_gtod_data).clock.cycle_last; - mask = VVAR(vsyscall_gtod_data).clock.mask; - mult = VVAR(vsyscall_gtod_data).clock.mult; - shift = VVAR(vsyscall_gtod_data).clock.shift; - - tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec; - nsec = VVAR(vsyscall_gtod_data).wall_time_nsec; - } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); - - /* calculate interval: */ - cycle_delta = (now - base) & mask; - /* convert to nsecs: */ - nsec += (cycle_delta * mult) >> shift; - - while (nsec >= NSEC_PER_SEC) { - tv->tv_sec += 1; - nsec -= NSEC_PER_SEC; - } - tv->tv_usec = nsec / NSEC_PER_USEC; + printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, tsk->comm, task_pid_nr(tsk), + message, regs->ip - 2, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } -int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) +static int addr_to_vsyscall_nr(unsigned long addr) { - if (tv) - do_vgettimeofday(tv); - if (tz) - do_get_tz(tz); - return 0; -} + int nr; -/* This will break when the xtime seconds get inaccurate, but that is - * unlikely */ -time_t __vsyscall(1) vtime(time_t *t) -{ - unsigned seq; - time_t result; - if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) - return time_syscall(t); + if ((addr & ~0xC00UL) != VSYSCALL_START) + return -EINVAL; - do { - seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; - result = VVAR(vsyscall_gtod_data).wall_time_sec; + return nr; +} - } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); +void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr; + long ret; + + local_irq_enable(); + + /* + * Real 64-bit user mode code has cs == __USER_CS. Anything else + * is bogus. + */ + if (regs->cs != __USER_CS) { + /* + * If we trapped from kernel mode, we might as well OOPS now + * instead of returning to some random address and OOPSing + * then. + */ + BUG_ON(!user_mode(regs)); + + /* Compat mode and non-compat 32-bit CS should both segfault. */ + warn_bad_vsyscall(KERN_WARNING, regs, + "illegal int 0xcc from 32-bit mode"); + goto sigsegv; + } - if (t) - *t = result; - return result; -} + /* + * x86-ism here: regs->ip points to the instruction after the int 0xcc, + * and int 0xcc is two bytes long. + */ + vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "illegal int 0xcc (exploit attempt?)"); + goto sigsegv; + } -/* Fast way to get current CPU and node. - This helps to do per node and per CPU caches in user space. - The result is not guaranteed without CPU affinity, but usually - works out because the scheduler tries to keep a thread on the same - CPU. + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); + goto sigsegv; + } - tcache must point to a two element sized long array. - All arguments can be NULL. */ -long __vsyscall(2) -vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) -{ - unsigned int p; - unsigned long j = 0; - - /* Fast cache - only recompute value once per jiffies and avoid - relatively costly rdtscp/cpuid otherwise. - This works because the scheduler usually keeps the process - on the same CPU and this syscall doesn't guarantee its - results anyways. - We do this here because otherwise user space would do it on - its own in a likely inferior way (no access to jiffies). - If you don't like it pass NULL. */ - if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { - p = tcache->blob[1]; - } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - native_read_tscp(&p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + tsk = current; + if (seccomp_mode(&tsk->seccomp)) + do_exit(SIGKILL); + + switch (vsyscall_nr) { + case 0: + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + 0); + break; } - if (tcache) { - tcache->blob[0] = j; - tcache->blob[1] = p; + + if (ret == -EFAULT) { + /* + * Bad news -- userspace fed a bad pointer to a vsyscall. + * + * With a real vsyscall, that would have caused SIGSEGV. + * To make writing reliable exploits using the emulated + * vsyscalls harder, generate SIGSEGV here as well. + */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + goto sigsegv; } - if (cpu) - *cpu = p & 0xfff; - if (node) - *node = p >> 12; - return 0; -} -static long __vsyscall(3) venosys_1(void) -{ - return -ENOSYS; -} + regs->ax = ret; -#ifdef CONFIG_SYSCTL -static ctl_table kernel_table2[] = { - { .procname = "vsyscall64", - .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec }, - {} -}; + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; -static ctl_table kernel_root_table2[] = { - { .procname = "kernel", .mode = 0555, - .child = kernel_table2 }, - {} -}; -#endif + local_irq_disable(); + return; + +sigsegv: + regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ + force_sig(SIGSEGV, current); + local_irq_disable(); +} -/* Assume __initcall executes before all user space. Hopefully kmod - doesn't violate that. We'll find out if it does. */ +/* + * Assume __initcall executes before all user space. Hopefully kmod + * doesn't violate that. We'll find out if it does. + */ static void __cpuinit vsyscall_set_cpu(int cpu) { unsigned long d; @@ -255,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu) if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); - /* Store cpu number in limit so that it can be loaded quickly - in user space in vgetcpu. - 12 bits for the CPU and 8 bits for the node. */ + /* + * Store cpu number in limit so that it can be loaded quickly + * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) + */ d = 0x0f40000000000ULL; d |= cpu; d |= (node & 0xf) << 12; d |= (node >> 4) << 48; + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } @@ -275,8 +247,10 @@ static int __cpuinit cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + return NOTIFY_DONE; } @@ -284,25 +258,23 @@ void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + extern char __vvar_page; + unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) { - BUG_ON(((unsigned long) &vgettimeofday != - VSYSCALL_ADDR(__NR_vgettimeofday))); - BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); - BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); - BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); -#ifdef CONFIG_SYSCTL - register_sysctl_table(kernel_root_table2); -#endif + BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); + on_each_cpu(cpu_vsyscall_init, NULL, 1); /* notifier priority > KVM */ hotcpu_notifier(cpu_vsyscall_notifier, 30); + return 0; } - __initcall(vsyscall_init); diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S new file mode 100644 index 000000000000..ffa845eae5ca --- /dev/null +++ b/arch/x86/kernel/vsyscall_emu_64.S @@ -0,0 +1,27 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include <linux/linkage.h> +#include <asm/irq_vectors.h> + +/* The unused parts of the page are filled with 0xcc by the linker script. */ + +.section .vsyscall_0, "a" +ENTRY(vsyscall_0) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_0) + +.section .vsyscall_1, "a" +ENTRY(vsyscall_1) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_1) + +.section .vsyscall_2, "a" +ENTRY(vsyscall_2) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_2) |