From e37e43a497d5a8b7c0cc1736d56986f432c394c9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Aug 2016 02:35:23 -0700 Subject: x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y) This allows x86_64 kernels to enable vmapped stacks by setting HAVE_ARCH_VMAP_STACK=y - which enables the CONFIG_VMAP_STACK=y high level Kconfig option. There are a couple of interesting bits: First, x86 lazily faults in top-level paging entries for the vmalloc area. This won't work if we get a page fault while trying to access the stack: the CPU will promote it to a double-fault and we'll die. To avoid this problem, probe the new stack when switching stacks and forcibly populate the pgd entry for the stack when switching mms. Second, once we have guard pages around the stack, we'll want to detect and handle stack overflow. I didn't enable it on x86_32. We'd need to rework the double-fault code a bit and I'm concerned about running out of vmalloc virtual addresses under some workloads. This patch, by itself, will behave somewhat erratically when the stack overflows while RSP is still more than a few tens of bytes above the bottom of the stack. Specifically, we'll get #PF and make it to no_context and them oops without reliably triggering a double-fault, and no_context doesn't know about stack overflows. The next patch will improve that case. Thank you to Nadav and Brian for helping me pay enough attention to the SDM to hopefully get this right. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Nadav Amit Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c88f3e2920b18e6cc621d772a04a62c06869037e.1470907718.git.luto@kernel.org [ Minor edits. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/switch_to.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm/switch_to.h') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8f321a1b03a1..14e4b20f0aaf 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -8,6 +8,28 @@ struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); +/* This runs runs on the previous thread's stack. */ +static inline void prepare_switch_to(struct task_struct *prev, + struct task_struct *next) +{ +#ifdef CONFIG_VMAP_STACK + /* + * If we switch to a stack that has a top-level paging entry + * that is not present in the current mm, the resulting #PF will + * will be promoted to a double-fault and we'll panic. Probe + * the new stack now so that vmalloc_fault can fix up the page + * tables if needed. This can only happen if we use a stack + * in vmap space. + * + * We assume that the stack is aligned so that it never spans + * more than one top-level paging entry. + * + * To minimize cache pollution, just follow the stack pointer. + */ + READ_ONCE(*(unsigned char *)next->thread.sp); +#endif +} + #ifdef CONFIG_X86_32 #ifdef CONFIG_CC_STACKPROTECTOR @@ -39,6 +61,8 @@ do { \ */ \ unsigned long ebx, ecx, edx, esi, edi; \ \ + prepare_switch_to(prev, next); \ + \ asm volatile("pushl %%ebp\n\t" /* save EBP */ \ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ @@ -103,7 +127,9 @@ do { \ * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL * has no effect. */ -#define switch_to(prev, next, last) \ +#define switch_to(prev, next, last) \ + prepare_switch_to(prev, next); \ + \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ -- cgit v1.2.3 From 7b32aeadbc95d4a41402c1c0da6aa3ab51af4c10 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:18 -0400 Subject: sched/x86: Add 'struct inactive_task_frame' to better document the sleeping task stack frame Add 'struct inactive_task_frame', which defines the layout of the stack for a sleeping process. For now, the only defined field is the BP register (frame pointer). Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/switch_to.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/include/asm/switch_to.h') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 14e4b20f0aaf..ec689c62c01f 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -30,6 +30,11 @@ static inline void prepare_switch_to(struct task_struct *prev, #endif } +/* data that is pointed to by thread.sp */ +struct inactive_task_frame { + unsigned long bp; +}; + #ifdef CONFIG_X86_32 #ifdef CONFIG_CC_STACKPROTECTOR -- cgit v1.2.3 From 0100301bfdf56a2a370c7157b5ab0fbf9313e1cd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:19 -0400 Subject: sched/x86: Rewrite the switch_to() code Move the low-level context switch code to an out-of-line asm stub instead of using complex inline asm. This allows constructing a new stack frame for the child process to make it seamlessly flow to ret_from_fork without an extra test and branch in __switch_to(). It also improves code generation for __schedule() by using the C calling convention instead of clobbering all registers. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-5-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/switch_to.h | 139 ++++++--------------------------------- 1 file changed, 21 insertions(+), 118 deletions(-) (limited to 'arch/x86/include/asm/switch_to.h') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index ec689c62c01f..886d5ea09dba 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,8 +2,12 @@ #define _ASM_X86_SWITCH_TO_H struct task_struct; /* one of the stranger aspects of C forward declarations */ + +struct task_struct *__switch_to_asm(struct task_struct *prev, + struct task_struct *next); + __visible struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); + struct task_struct *next); struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); @@ -32,131 +36,30 @@ static inline void prepare_switch_to(struct task_struct *prev, /* data that is pointed to by thread.sp */ struct inactive_task_frame { +#ifdef CONFIG_X86_64 + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned long r12; +#else + unsigned long si; + unsigned long di; +#endif + unsigned long bx; unsigned long bp; + unsigned long ret_addr; }; -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movl %P[task_canary](%[next]), %%ebx\n\t" \ - "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" -#define __switch_canary_oparam \ - , [stack_canary] "=m" (stack_canary.canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ +struct fork_frame { + struct inactive_task_frame frame; + struct pt_regs regs; +}; -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. - */ #define switch_to(prev, next, last) \ do { \ - /* \ - * Context-switching clobbers all registers, so we clobber \ - * them explicitly, via unused output variables. \ - * (EAX and EBP is not listed because EBP is saved/restored \ - * explicitly for wchan access and EAX is the return value of \ - * __switch_to()) \ - */ \ - unsigned long ebx, ecx, edx, esi, edi; \ - \ prepare_switch_to(prev, next); \ \ - asm volatile("pushl %%ebp\n\t" /* save EBP */ \ - "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ - "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ - "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ - "pushl %[next_ip]\n\t" /* restore EIP */ \ - __switch_canary \ - "jmp __switch_to\n" /* regparm call */ \ - "1:\t" \ - "popl %%ebp\n\t" /* restore EBP */ \ - \ - /* output parameters */ \ - : [prev_sp] "=m" (prev->thread.sp), \ - [prev_ip] "=m" (prev->thread.ip), \ - "=a" (last), \ - \ - /* clobbered output registers: */ \ - "=b" (ebx), "=c" (ecx), "=d" (edx), \ - "=S" (esi), "=D" (edi) \ - \ - __switch_canary_oparam \ - \ - /* input parameters: */ \ - : [next_sp] "m" (next->thread.sp), \ - [next_ip] "m" (next->thread.ip), \ - \ - /* regparm parameters for __switch_to(): */ \ - [prev] "a" (prev), \ - [next] "d" (next) \ - \ - __switch_canary_iparam \ - \ - : /* reloaded segment registers */ \ - "memory"); \ + ((last) = __switch_to_asm((prev), (next))); \ } while (0) -#else /* CONFIG_X86_32 */ - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" - -#define __EXTRA_CLOBBER \ - , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15", "flags" - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movq %P[task_canary](%%rsi),%%r8\n\t" \ - "movq %%r8,"__percpu_arg([gs_canary])"\n\t" -#define __switch_canary_oparam \ - , [gs_canary] "=m" (irq_stack_union.stack_canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* - * There is no need to save or restore flags, because flags are always - * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL - * has no effect. - */ -#define switch_to(prev, next, last) \ - prepare_switch_to(prev, next); \ - \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - "movq "__percpu_arg([current_task])",%%rsi\n\t" \ - __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ - "jnz ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - __switch_canary_oparam \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (current_task) \ - __switch_canary_iparam \ - : "memory", "cc" __EXTRA_CLOBBER) - -#endif /* CONFIG_X86_32 */ - #endif /* _ASM_X86_SWITCH_TO_H */ -- cgit v1.2.3 From 616d24835eeafa8ef3466479db028abfdfc77531 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:20 -0400 Subject: sched/x86: Pass kernel thread parameters in 'struct fork_frame' Instead of setting up a fake pt_regs context, put the kernel thread function pointer and arg into the unused callee-restored registers of 'struct fork_frame'. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-6-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/switch_to.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include/asm/switch_to.h') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 886d5ea09dba..5cb436acd463 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -34,6 +34,8 @@ static inline void prepare_switch_to(struct task_struct *prev, #endif } +asmlinkage void ret_from_fork(void); + /* data that is pointed to by thread.sp */ struct inactive_task_frame { #ifdef CONFIG_X86_64 -- cgit v1.2.3