From becf292446e9f2dc8842c448836bbe8005e24db0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:57 -0800 Subject: x86/mm: Add INVPCID helpers commit 060a402a1ddb551455ee410de2eadd3349f2801b upstream. This adds helpers for each of the four currently-specified INVPCID modes. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8a62b23ad686888cee01da134c91409e22064db9.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6433e28dc9c8..5f494d8bd158 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,54 @@ #include #include +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + u64 desc[2] = { pcid, addr }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + #ifdef CONFIG_PARAVIRT #include #else -- cgit v1.2.3 From 04ec428b15f161ce8449756fb64b6f380c8d95fd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 15:51:16 +0100 Subject: x86/mm: Fix INVPCID asm constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e2c7698cd61f11d4077fdb28148b2d31b82ac848 upstream. So we want to specify the dependency on both @pcid and @addr so that the compiler doesn't reorder accesses to them *before* the TLB flush. But for that to work, we need to express this properly in the inline asm and deref the whole desc array, not the pointer to it. See clwb() for an example. 
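For reference, with this constraint fix folded into the original helper, __invpcid() ends up as below (assembled purely from the two diffs in this series, shown only to make the before/after easier to compare):

static inline void __invpcid(unsigned long pcid, unsigned long addr,
			     unsigned long type)
{
	struct { u64 d[2]; } desc = { { pcid, addr } };

	/*
	 * "m" (desc) dereferences the descriptor itself, so the compiler
	 * must materialize it in memory before the asm runs; "c" (&desc)
	 * passes its address in %rcx/%ecx for the INVPCID memory operand.
	 */
	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
		      : : "m" (desc), "a" (type), "c" (&desc) : "memory");
}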
This fixes the build error on 32-bit: arch/x86/include/asm/tlbflush.h: In function ‘__invpcid’: arch/x86/include/asm/tlbflush.h:26:18: error: memory input 0 is not directly addressable which gcc4.7 caught but 5.x didn't. Which is strange. :-\ Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Michael Matz Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 5f494d8bd158..bd0ee0639bf0 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,7 +10,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { - u64 desc[2] = { pcid, addr }; + struct { u64 d[2]; } desc = { { pcid, addr } }; /* * The memory clobber is because the whole point is to invalidate @@ -22,7 +22,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * invpcid (%rcx), %rax in long mode. */ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (desc) : "memory"); + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 -- cgit v1.2.3 From 85d3700c744a11ee2989252acf50ccbbd814167a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:59 -0800 Subject: x86/mm: If INVPCID is available, use it to flush global mappings commit d8bced79af1db6734f66b42064cc773cada2ce99 upstream. On my Skylake laptop, INVPCID function 2 (flush absolutely everything) takes about 376ns, whereas saving flags, twiddling CR4.PGE to flush global mappings, and restoring flags takes about 539ns. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ed0ef62581c0ea9c99b9bf6df726015e96d44743.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index bd0ee0639bf0..6cf0893e9b70 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -159,6 +159,15 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + */ + invpcid_flush_all(); + return; + } + /* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can -- cgit v1.2.3 From 8d5ee51a6bce71d8905d8f01d0931f69be4489d5 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 1 Apr 2016 14:31:26 -0700 Subject: mm/rmap: batched invalidations should use existing api commit 858eaaa711700ce4595e039441e239e56d7b9514 upstream. 
The recently introduced batched invalidations mechanism uses its own mechanism for shootdown. However, it gets the accounting of interrupts wrong (e.g., inc_irq_stat is called for local invalidations), gets trace-points wrong (e.g., TLB_REMOTE_SHOOTDOWN for local invalidations), and may break some platforms as it bypasses the invalidation mechanisms of Xen and SGI UV. This patch reuses the existing TLB flushing mechanisms instead. We use a NULL mm to indicate that a global invalidation is required. Fixes: 72b252aed506b8 ("mm: send one IPI per CPU to TLB flush all entries after unmapping pages") Signed-off-by: Nadav Amit Cc: Mel Gorman Cc: Rik van Riel Cc: Dave Hansen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6cf0893e9b70..4dc534175b5e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -325,12 +325,6 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ -/* Not inlined due to inc_irq_stat not being defined yet */ -#define flush_tlb_local() { \ - inc_irq_stat(irq_tlb_count); \ - local_flush_tlb(); \ -} - #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) -- cgit v1.2.3 From 70a39c7fd167399fde76aeac314dce026a255b49 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 09:39:08 -0700 Subject: x86/mm, sched/core: Uninline switch_mm() commit 69c0319aabba45bcf33178916a2f06967b4adede upstream. It's fairly large and it has quite a few callers. This may also help untangle some headers down the road. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/54f3367803e7f80b2be62c8a21879aa74b1a5f57.1461688545.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mmu_context.h | 98 +------------------------------------- 1 file changed, 2 insertions(+), 96 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index bfd9b2a35a0b..96e233b3597a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -104,103 +104,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) #endif } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); - - if (likely(prev != next)) { -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.)
- * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * - */ - load_cr3(next->pgd); - - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - - /* Load per-mm CR4 state */ - load_mm_cr4(next); +extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. - * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif - } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. - */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); - } - } -#endif -} #define activate_mm(prev, next) \ do { \ -- cgit v1.2.3 From 4ead44fd2525ed97e5362a806d312a0e3b0ea445 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 09:39:09 -0700 Subject: x86/mm, sched/core: Turn off IRQs in switch_mm() commit 078194f8e9fe3cf54c8fd8bded48a1db5bd8eb8a upstream. Potential races between switch_mm() and TLB-flush or LDT-flush IPIs could be very messy. AFAICT the code is currently okay, whether by accident or by careful design, but enabling PCID will make it considerably more complicated and will no longer be obviously safe. Fix it with a big hammer: run switch_mm() with IRQs off. To avoid a performance hit in the scheduler, we take advantage of our knowledge that the scheduler already has IRQs disabled when it calls switch_mm(). 
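The out-of-line bodies land in arch/x86/mm/tlb.c and are not visible in this header-only diff; per the commit message, the irq-flags wrapper amounts to the following sketch (the scheduler calls switch_mm_irqs_off() directly and skips the save/restore it does not need):

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);	/* LDT/TLB-flush IPIs must not race with us */
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}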
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f19baf759693c9dcae64bbff76189db77cb13398.1461688545.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mmu_context.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 96e233b3597a..44fc93987869 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -107,6 +107,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); +extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); +#define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ -- cgit v1.2.3 From 3b9d9ec0d8261bb9b12f858e66f0c84cd2a6a3bb Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 11 Aug 2016 15:44:30 +0800 Subject: x86/irq: Do not subtract irq_tlb_count from irq_call_count commit 82ba4faca1bffad429f15c90c980ffd010366c25 upstream. Since commit: 52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR") the TLB remote shootdown is done through the call function vector. That commit didn't take care of irq_tlb_count, which a later commit: fd0f5869724f ("x86: Distinguish TLB shootdown interrupts from other functions call interrupts") ... tried to fix. The fix assumes every increase of irq_tlb_count has a corresponding increase of irq_call_count. So the irq_call_count is always bigger than irq_tlb_count and we could subtract irq_tlb_count from irq_call_count. Unfortunately this is not true for the smp_call_function_single() case. The IPI is only sent if the target CPU's call_single_queue is empty when adding a csd into it in generic_exec_single(). That means if two threads are both adding flush tlb csds to the same CPU's call_single_queue, only one IPI is sent. In other words, the irq_call_count is incremented by 1 but irq_tlb_count is incremented by 2. Over time, irq_tlb_count will be bigger than irq_call_count and the subtraction will produce a very large irq_call_count value due to overflow. Considering that: 1) it's not worth sending more IPIs for the sake of accurate counting of irq_call_count in generic_exec_single(); 2) it's not easy to tell if the call function interrupt is for TLB shootdown in __smp_call_function_single_interrupt(). Not excluding TLB shootdown from the call function count seems to be the simplest fix and this patch just does that. This bug was found by LKP's cyclic performance regression tracking recently with the vm-scalability test suite. I have bisected to commit: 3dec0ba0be6a ("mm/rmap: share the i_mmap_rwsem") This commit didn't do anything wrong but revealed the irq_call_count problem. IIUC, the commit makes rwc->rmap_one in rmap_walk_file concurrent with multiple threads. When rmap_one is try_to_unmap_one(), then multiple threads could queue flush TLB to the same CPU but only one IPI will be sent. Since the commit was added in Linux v3.19, the counting problem only shows up from v3.19 onwards. Signed-off-by: Aaron Lu Cc: Alex Shi Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Huang Ying Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tomoki Sekiyama Link: http://lkml.kernel.org/r/20160811074430.GA18163@aaronlu.sh.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/hardirq.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 7178043b0e1d..59405a248fc2 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,10 +22,6 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - /* - * irq_tlb_count is double-counted in irq_call_count, so it must be - * subtracted from irq_call_count when displaying irq_call_count - */ unsigned int irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR -- cgit v1.2.3 From 227d6f0e79f809e448d3157fbfd00eb54dcbb54e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Apr 2017 00:01:20 -0700 Subject: x86/mm: Remove flush_tlb() and flush_tlb_current_task() commit 29961b59a51f8c6838a26a45e871a7ed6771809b upstream. I was trying to figure out how flush_tlb_current_task() would possibly work correctly if current->mm != current->active_mm, but I realized I could spare myself the effort: it has no callers except the unused flush_tlb() macro. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e52d64c11690f85e9f1d69d7b48cc2269cd2e94b.1492844372.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4dc534175b5e..d9b3a9937166 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -204,7 +204,6 @@ static inline void __flush_tlb_one(unsigned long addr) /* * TLB flushing: * - * - flush_tlb() flushes the current mm struct TLBs * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page @@ -236,11 +235,6 @@ static inline void flush_tlb_all(void) __flush_tlb_all(); } -static inline void flush_tlb(void) -{ - __flush_tlb_up(); -} - static inline void local_flush_tlb(void) { __flush_tlb_up(); @@ -302,14 +296,11 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); -#define flush_tlb() flush_tlb_current_task() - void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end); -- cgit v1.2.3 From 3efba6062a410a2a65fc9d6f53dca63db2602e65 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 22 May 2017 15:30:01 -0700 Subject: x86/mm: Reimplement flush_tlb_page() using flush_tlb_mm_range() commit
ca6c99c0794875c6d1db6e22f246699691ab7e6b upstream. flush_tlb_page() was very similar to flush_tlb_mm_range() except that it had a couple of issues: - It was missing an smp_mb() in the case where current->active_mm != mm. (This is a longstanding bug reported by Nadav Amit) - It was missing tracepoints and vm counter updates. The only reason that I can see for keeping it as a separate function is that it could avoid a few branches that flush_tlb_mm_range() needs to decide to flush just one page. This hardly seems worthwhile. If we decide we want to get rid of those branches again, a better way would be to introduce an __flush_tlb_mm_range() helper and make both flush_tlb_page() and flush_tlb_mm_range() use it. Signed-off-by: Andy Lutomirski Acked-by: Kees Cook Cc: Andrew Morton Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Nadav Amit Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/3cc3847cf888d8907577569b8bac3f01992ef8f9.1495492063.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d9b3a9937166..47f628464935 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -296,11 +296,15 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); +static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) +{ + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); +} + void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end); -- cgit v1.2.3 From b2e24274d50e0ecdf560ebe06dbed0cc648ad3f9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 28 May 2017 10:00:14 -0700 Subject: x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code commit ce4a4e565f5264909a18c733b864c3f74467f69e upstream. The UP asm/tlbflush.h generates somewhat nicer code than the SMP version. Aside from that, it's fallen quite a bit behind the SMP code: - flush_tlb_mm_range() didn't flush individual pages if the range was small. - The lazy TLB code was much weaker. This usually wouldn't matter, but, if a kernel thread flushed its lazy "active_mm" more than once (due to reclaim or similar), it wouldn't be unlazied and would instead pointlessly flush repeatedly. - Tracepoints were missing. Aside from that, simply having the UP code around was a maintenance burden, since it meant that any change to the TLB flush code had to make sure not to break it. Simplify everything by deleting the UP code.
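The shared implementation that all configurations now use lives in arch/x86/mm/tlb.c and is not part of this header diff; as a hedged sketch of the 4.4-era leave_mm() that the lazy-TLB bullet above refers to (recalled for orientation, not taken from this patch):

void leave_mm(int cpu)
{
	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);

	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();
	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
		/* Stop receiving flush IPIs and move to a safe page table */
		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
		load_cr3(swapper_pg_dir);
		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
	}
}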
Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Nadav Amit Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/hardirq.h | 2 +- arch/x86/include/asm/mmu.h | 6 --- arch/x86/include/asm/mmu_context.h | 2 - arch/x86/include/asm/tlbflush.h | 78 +------------------------------------- 4 files changed, 2 insertions(+), 86 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 59405a248fc2..9b76cd331990 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,8 +22,8 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - unsigned int irq_tlb_count; #endif + unsigned int irq_tlb_count; #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; #endif diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 55234d5e7160..7680b76adafc 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -24,12 +24,6 @@ typedef struct { atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; -#ifdef CONFIG_SMP void leave_mm(int cpu); -#else -static inline void leave_mm(int cpu) -{ -} -#endif #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 44fc93987869..9bfc5fd77015 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -98,10 +98,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { -#ifdef CONFIG_SMP if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); -#endif } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 47f628464935..822ad36be17b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -6,6 +6,7 @@ #include #include +#include static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) @@ -64,10 +65,8 @@ static inline void invpcid_flush_all_nonglobals(void) #endif struct tlb_state { -#ifdef CONFIG_SMP struct mm_struct *active_mm; int state; -#endif /* * Access to this CR4 shadow and to H/W CR4 is protected by @@ -215,79 +214,6 @@ static inline void __flush_tlb_one(unsigned long addr) * and page-granular flushes are available only on i486 and up. */ -#ifndef CONFIG_SMP - -/* "_up" is for UniProcessor. - * - * This is a helper for other header functions. *Not* intended to be called - * directly. All global TLB flushes need to either call this, or to bump the - * vm statistics themselves. 
- */ -static inline void __flush_tlb_up(void) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); -} - -static inline void flush_tlb_all(void) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb_all(); -} - -static inline void local_flush_tlb(void) -{ - __flush_tlb_up(); -} - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void flush_tlb_mm_range(struct mm_struct *mm, - unsigned long start, unsigned long end, unsigned long vmflag) -{ - if (mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ -} - -static inline void reset_lazy_tlbstate(void) -{ -} - -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) -{ - flush_tlb_all(); -} - -#else /* SMP */ - -#include - #define local_flush_tlb() __flush_tlb() #define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) @@ -318,8 +244,6 @@ static inline void reset_lazy_tlbstate(void) this_cpu_write(cpu_tlbstate.active_mm, &init_mm); } -#endif /* SMP */ - #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) -- cgit v1.2.3 From 78043e5b6fb2921d836b31f23e89e52925191153 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:19 -0700 Subject: x86/mm: Disable PCID on 32-bit kernels commit cba4671af7550e008f7a7835f06df0763825bf3e upstream. 32-bit kernels on new hardware will see PCID in CPUID, but PCID can only be used in 64-bit mode. Rather than making all PCID code conditional, just disable the feature on 32-bit builds. 
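The mechanics behind this: a bit set in a DISABLED_MASK word makes the corresponding feature test a compile-time constant, so all PCID paths drop out of 32-bit builds. A simplified sketch modeled on cpu_feature_enabled()/DISABLED_MASK_BIT_SET() from arch/x86/include/asm/cpufeature.h (the helper name is illustrative, not kernel API):

static __always_inline bool pcid_usable(void)
{
	/*
	 * X86_FEATURE_PCID is bit 17 of word 4, so DISABLE_PCID in
	 * DISABLED_MASK4 turns this into a constant false on 32-bit;
	 * 64-bit builds fall through to the runtime check.
	 */
	if ((1UL << (X86_FEATURE_PCID & 31)) & DISABLED_MASK4)
		return false;
	return static_cpu_has(X86_FEATURE_PCID);
}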
Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/2e391769192a4d31b808410c383c6bf0734bc6ea.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/disabled-features.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index f226df064660..8b17c2ad1048 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ /* @@ -35,7 +37,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 -- cgit v1.2.3 From fd0504525efd2ce2063cd4229baabd3e3a56ecbc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:21 -0700 Subject: x86/mm: Enable CR4.PCIDE on supported systems commit 660da7c9228f685b2ebe664f9fd69aaddcc420b5 upstream. We can use PCID if the CPU has PCID and PGE and we're not on Xen. By itself, this has no effect. A followup patch will start using PCID. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Boris Ostrovsky Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 822ad36be17b..9fc5968da820 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -190,6 +190,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) -- cgit v1.2.3 From 0fa147b407478e73fe7a478677ff2b12bb824014 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:33 -0500 Subject: x86/boot: Add early cmdline parsing for options with arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e505371dd83963caae1a37ead9524e8d997341be upstream. 
Add a cmdline_find_option() function to look for cmdline options that take arguments. The argument is returned in a supplied buffer and the argument length (regardless of whether it fits in the supplied buffer) is returned, with -1 indicating not found. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cmdline.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index e01f7f7ccb0c..84ae170bc3d0 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,5 +2,7 @@ #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ -- cgit v1.2.3 From 8a43ddfb93a0c6ae1a6e1f5c25705ec5d1843c40 Mon Sep 17 00:00:00 2001 From: Richard Fellner Date: Thu, 4 May 2017 14:26:50 +0200 Subject: KAISER: Kernel Address Isolation This patch introduces our implementation of KAISER (Kernel Address Isolation to have Side-channels Efficiently Removed), a kernel isolation technique to close hardware side channels on kernel address information. More information about the patch can be found on: https://github.com/IAIK/KAISER From: Richard Fellner From: Daniel Gruss X-Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Date: Thu, 4 May 2017 14:26:50 +0200 Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2 Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5 To: To: Cc: Cc: Cc: Michael Schwarz Cc: Richard Fellner Cc: Ingo Molnar Cc: Cc: After several recent works [1,2,3] KASLR on x86_64 was basically considered dead by many researchers. We have been working on an efficient but effective fix for this problem and found that not mapping the kernel space when running in user mode is the solution to this problem [4] (the corresponding paper [5] will be presented at ESSoS17). With this RFC patch we allow anybody to configure their kernel with the flag CONFIG_KAISER to add our defense mechanism. If there are any questions we would love to answer them. We also appreciate any comments! 
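One orienting detail before the patch body: KAISER allocates each pgd as a pair of adjacent pages, with the normal (kernel) copy on the even page and the shadow (user) copy on the odd page, so switching address spaces is a single bit flip in CR3. The asm macros in the diff below toggle exactly that bit (0x1000); as a sketch (helper names illustrative, not from the patch):

static inline unsigned long kaiser_user_cr3(unsigned long kernel_cr3)
{
	return kernel_cr3 | 0x1000;	/* shadow pgd: the odd page */
}

static inline unsigned long kaiser_kernel_cr3(unsigned long any_cr3)
{
	return any_cr3 & ~0x1000UL;	/* normal pgd: the even page */
}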
Cheers, Daniel (+ the KAISER team from Graz University of Technology) [1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf [2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf [3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf [4] https://github.com/IAIK/KAISER [5] https://gruss.cc/files/kaiser.pdf [patch based also on https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch] Signed-off-by: Richard Fellner Signed-off-by: Moritz Lipp Signed-off-by: Daniel Gruss Signed-off-by: Michael Schwarz Acked-by: Jiri Kosina Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/kaiser.h | 113 +++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable.h | 4 ++ arch/x86/include/asm/pgtable_64.h | 21 +++++++ arch/x86/include/asm/pgtable_types.h | 12 +++- arch/x86/include/asm/processor.h | 7 ++- 6 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 arch/x86/include/asm/kaiser.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 59caa55fb9b5..ee52ff858699 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -187,7 +187,7 @@ extern char irq_entries_start[]; #define VECTOR_RETRIGGERED ((void *)~0UL) typedef struct irq_desc* vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); #endif /* !ASSEMBLY_ */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h new file mode 100644 index 000000000000..63ee8309b35b --- /dev/null +++ b/arch/x86/include/asm/kaiser.h @@ -0,0 +1,113 @@ +#ifndef _ASM_X86_KAISER_H +#define _ASM_X86_KAISER_H + +/* This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. + * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, + * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. + * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. + * + * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions + * of the user space, or the stacks. 
+ */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_KAISER + +.macro _SWITCH_TO_KERNEL_CR3 reg +movq %cr3, \reg +andq $(~0x1000), \reg +movq \reg, %cr3 +.endm + +.macro _SWITCH_TO_USER_CR3 reg +movq %cr3, \reg +orq $(0x1000), \reg +movq \reg, %cr3 +.endm + +.macro SWITCH_KERNEL_CR3 +pushq %rax +_SWITCH_TO_KERNEL_CR3 %rax +popq %rax +.endm + +.macro SWITCH_USER_CR3 +pushq %rax +_SWITCH_TO_USER_CR3 %rax +popq %rax +.endm + +.macro SWITCH_KERNEL_CR3_NO_STACK +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_KERNEL_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +.endm + + +.macro SWITCH_USER_CR3_NO_STACK + +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_USER_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + +.endm + +#else /* CONFIG_KAISER */ + +.macro SWITCH_KERNEL_CR3 reg +.endm +.macro SWITCH_USER_CR3 reg +.endm +.macro SWITCH_USER_CR3_NO_STACK +.endm +.macro SWITCH_KERNEL_CR3_NO_STACK +.endm + +#endif /* CONFIG_KAISER */ +#else /* __ASSEMBLY__ */ + + +#ifdef CONFIG_KAISER +// Upon kernel/user mode switch, it may happen that +// the address space has to be switched before the registers have been stored. +// To change the address space, another register is needed. +// A register therefore has to be stored/restored. +// +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +#endif /* CONFIG_KAISER */ + +/** + * shadowmem_add_mapping - map a virtual memory part to the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * + * the mapping is done on a global scope, so no bigger synchronization has to be done. + * the pages have to be manually unmapped again when they are not needed any longer. + */ +extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + + +/** + * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + +/** + * shadowmem_initialize_mapping - Initalize the shadow mapping + * + * most parts of the shadow mapping can be mapped upon boot time. + * only the thread stacks have to be mapped on runtime. + * the mapped regions are not unmapped at all. 
+ */ +extern void kaiser_init(void); + +#endif + + + +#endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6ec0c8b2e9df..6a843d254114 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -856,6 +856,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_KAISER + // clone the shadow pgd part as well + memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); +#endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 2ee781114d34..2131edda6620 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +#ifdef CONFIG_KAISER +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); +} + +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); +} +#endif /* CONFIG_KAISER */ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_KAISER + // We know that a pgd is page aligned. + // Therefore the lower indices have to be mapped to user space. + // These pages are mapped to the shadow mapping. + if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + + pgdp->pgd = pgd.pgd & ~_PAGE_USER; +#else /* CONFIG_KAISER */ *pgdp = pgd; +#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 79c91853e50e..984670491901 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -39,7 +39,11 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#ifdef CONFIG_KAISER +#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +#else +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) @@ -89,7 +93,11 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#ifdef CONFIG_KAISER +#define _PAGE_PROTNONE (_AT(pteval_t, 0)) +#else +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#endif #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2d5a50cb61a2..6a2e0a0b4a96 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -305,7 +305,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); @@ -332,6 +332,11 @@ union irq_stack_union { char gs_base[40]; unsigned long stack_canary; }; + + struct { + 
char irq_stack_pointer[64]; + char unused[IRQ_STACK_SIZE - 64]; + }; }; DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -- cgit v1.2.3 From bed9bb7f3e6d4045013d2bb9e4004896de57f02b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 30 Aug 2017 16:23:00 -0700 Subject: kaiser: merged update Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging). Signed-off-by: Dave Hansen Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 43 +++++++++++++++++++------------- arch/x86/include/asm/pgtable.h | 18 +++++++++++--- arch/x86/include/asm/pgtable_64.h | 48 ++++++++++++++++++++++++++++++------ arch/x86/include/asm/pgtable_types.h | 6 +---- 4 files changed, 82 insertions(+), 33 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 63ee8309b35b..0703f48777f3 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -16,13 +16,17 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg +#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~0x1000), \reg +#endif movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg +#ifdef CONFIG_KAISER_REAL_SWITCH orq $(0x1000), \reg +#endif movq \reg, %cr3 .endm @@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm #endif /* CONFIG_KAISER */ + #else /* __ASSEMBLY__ */ #ifdef CONFIG_KAISER -// Upon kernel/user mode switch, it may happen that -// the address space has to be switched before the registers have been stored. -// To change the address space, another register is needed. -// A register therefore has to be stored/restored. -// -DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); +/* + * Upon kernel/user mode switch, it may happen that the address + * space has to be switched before the registers have been + * stored. To change the address space, another register is + * needed. A register therefore has to be stored/restored. +*/ -#endif /* CONFIG_KAISER */ +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); /** - * shadowmem_add_mapping - map a virtual memory part to the shadow mapping + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range * @size: the size of the range * @flags: The mapping flags of the pages * - * the mapping is done on a global scope, so no bigger synchronization has to be done. - * the pages have to be manually unmapped again when they are not needed any longer. + * The mapping is done on a global scope, so no bigger + * synchronization has to be done. the pages have to be + * manually unmapped again when they are not needed any longer. */ -extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); /** - * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping * @addr: the start address of the range * @size: the size of the range */ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); /** - * shadowmem_initialize_mapping - Initalize the shadow mapping + * kaiser_initialize_mapping - Initalize the shadow mapping * - * most parts of the shadow mapping can be mapped upon boot time. - * only the thread stacks have to be mapped on runtime. 
- * the mapped regions are not unmapped at all. + * Most parts of the shadow mapping can be mapped upon boot + * time. Only per-process things like the thread stacks + * or a new LDT have to be mapped at runtime. These boot- + * time mappings are permanent and nevertunmapped. */ extern void kaiser_init(void); -#endif +#endif /* CONFIG_KAISER */ + +#endif /* __ASSEMBLY */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6a843d254114..176035fa057e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + pgdval_t ignore_flags = _PAGE_USER; + /* + * We set NX on KAISER pgds that map userspace memory so + * that userspace can not meaningfully use the kernel + * page table by accident; it will fault on the first + * instruction it tries to run. See native_set_pgd(). + */ + if (IS_ENABLED(CONFIG_KAISER)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_KAISER - // clone the shadow pgd part as well - memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); #endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 2131edda6620..b0adf0d11c9a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud) } #ifdef CONFIG_KAISER -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +{ return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +{ return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); } +#else +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +{ + BUILD_BUG_ON(1); + return NULL; +} +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +{ + return pgdp; +} #endif /* CONFIG_KAISER */ +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { #ifdef CONFIG_KAISER - // We know that a pgd is page aligned. - // Therefore the lower indices have to be mapped to user space. - // These pages are mapped to the shadow mapping. - if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { + pteval_t extra_kern_pgd_flags = 0; + /* Do we need to also populate the shadow pgd? 
*/ + if (is_userspace_pgd(pgdp)) { native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + extra_kern_pgd_flags = _PAGE_NX; } - - pgdp->pgd = pgd.pgd & ~_PAGE_USER; + pgdp->pgd = pgd.pgd; + pgdp->pgd |= extra_kern_pgd_flags; #else /* CONFIG_KAISER */ *pgdp = pgd; #endif diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 984670491901..a70d6100b3df 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -42,7 +42,7 @@ #ifdef CONFIG_KAISER #define _PAGE_GLOBAL (_AT(pteval_t, 0)) #else -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) @@ -93,11 +93,7 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#ifdef CONFIG_KAISER -#define _PAGE_PROTNONE (_AT(pteval_t, 0)) -#else #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#endif #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) -- cgit v1.2.3 From edde73205b3fdde8c8a3adfce78cc6d0de72386b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 5 Sep 2017 12:05:01 -0700 Subject: kaiser: do not set _PAGE_NX on pgd_none native_pgd_clear() uses native_set_pgd(), so native_set_pgd() must avoid setting the _PAGE_NX bit on an otherwise pgd_none() entry: usually that just generated a warning on exit, but sometimes more mysterious and damaging failures (our production machines could not complete booting). The original fix to this just avoided adding _PAGE_NX to an empty entry; but eventually more problems surfaced with kexec, and EFI mapping expected to be a problem too. So now instead change native_set_pgd() to update shadow only if _PAGE_USER: A few places (kernel/machine_kexec_64.c, platform/efi/efi_64.c for sure) use set_pgd() to set up a temporary internal virtual address space, with physical pages remapped at what Kaiser regards as userspace addresses: Kaiser then assumes a shadow pgd follows, which it will try to corrupt. This appears to be responsible for the recent kexec and kdump failures; though it's unclear how those did not manifest as a problem before. Ah, the shadow pgd will only be assumed to "follow" if the requested pgd is on an even-numbered page: so I suppose it was going wrong 50% of the time all along. What we need is a flag to set_pgd(), to tell it we're dealing with userspace. Er, isn't that what the pgd's _PAGE_USER bit is saying? Add a test for that. But we cannot do the same for pgd_clear() (which may be called to clear corrupted entries - set aside the question of "corrupt in which pgd?" until later), so there just rely on pgd_clear() not being called in the problematic cases - with a WARN_ON_ONCE() which should fire half the time if it is. But this is getting too big for an inline function: move it into arch/x86/mm/kaiser.c (which then demands a boot/compressed mod); and de-void and de-space native_get_shadow/normal_pgd() while here. 
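The new out-of-line kaiser_set_shadow_pgd() lives in arch/x86/mm/kaiser.c and is not in the header diff below; a hedged sketch of the behavior the commit message describes (the real code may differ in detail):

pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (pgd.pgd & _PAGE_USER) {
		/*
		 * A real userspace entry: mirror it into the shadow pgd
		 * and keep the kernel copy non-executable from user CR3.
		 */
		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
		pgd.pgd |= _PAGE_NX;
	}
	/*
	 * !_PAGE_USER entries (pgd_none, kexec/EFI temporary tables)
	 * touch only the kernel copy and leave any would-be shadow alone.
	 */
	return pgd;
}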
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable_64.h | 51 ++++++++++----------------------------- 1 file changed, 13 insertions(+), 38 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index b0adf0d11c9a..8629be9e6649 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -107,61 +107,36 @@ static inline void native_pud_clear(pud_t *pud) } #ifdef CONFIG_KAISER -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { - return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) { - return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); + return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); } #else -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { BUILD_BUG_ON(1); return NULL; } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) { return pgdp; } #endif /* CONFIG_KAISER */ -/* - * Page table pages are page-aligned. The lower half of the top - * level is used for userspace and the top half for the kernel. - * This returns true for user pages that need to get copied into - * both the user and kernel copies of the page tables, and false - * for kernel pages that should only be in the kernel copy. - */ -static inline bool is_userspace_pgd(void *__ptr) -{ - unsigned long ptr = (unsigned long)__ptr; - - return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); -} - static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { -#ifdef CONFIG_KAISER - pteval_t extra_kern_pgd_flags = 0; - /* Do we need to also populate the shadow pgd? */ - if (is_userspace_pgd(pgdp)) { - native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; - /* - * Even if the entry is *mapping* userspace, ensure - * that userspace can not use it. This way, if we - * get out to userspace running on the kernel CR3, - * userspace will crash instead of running. - */ - extra_kern_pgd_flags = _PAGE_NX; - } - pgdp->pgd = pgd.pgd; - pgdp->pgd |= extra_kern_pgd_flags; -#else /* CONFIG_KAISER */ - *pgdp = pgd; -#endif + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) -- cgit v1.2.3 From 5fbd46c4be78174656b52e1b04d3057a5dd7af66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 19:18:07 -0700 Subject: kaiser: tidied up asm/kaiser.h somewhat Mainly deleting a surfeit of blank lines, and reflowing header comment. 
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 0703f48777f3..7394ba9f9951 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -1,15 +1,17 @@ #ifndef _ASM_X86_KAISER_H #define _ASM_X86_KAISER_H - -/* This file includes the definitions for the KAISER feature. - * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. - * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, - * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, - * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. - * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. +/* + * This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on + * the kernel virtual memory. It has a shadow pgd for every process: the + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole + * user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, + * the shadow pgd is enabled. By this, the virtual memory caches are freed, + * and the user may not attack the whole kernel memory. * - * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions - * of the user space, or the stacks. + * A minimalistic kernel mapping holds the parts needed to be mapped in user + * mode, such as the entry/exit functions of the user space, or the stacks. */ #ifdef __ASSEMBLY__ #ifdef CONFIG_KAISER @@ -48,13 +50,10 @@ _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm - .macro SWITCH_USER_CR3_NO_STACK - movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) _SWITCH_TO_USER_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax - .endm #else /* CONFIG_KAISER */ @@ -72,7 +71,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax #else /* __ASSEMBLY__ */ - #ifdef CONFIG_KAISER /* * Upon kernel/user mode switch, it may happen that the address @@ -80,7 +78,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax * stored. To change the address space, another register is * needed. A register therefore has to be stored/restored. */ - DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); /** @@ -95,7 +92,6 @@ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); */ extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); - /** * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping * @addr: the start address of the range @@ -104,12 +100,12 @@ extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned l extern void kaiser_remove_mapping(unsigned long start, unsigned long size); /** - * kaiser_initialize_mapping - Initalize the shadow mapping + * kaiser_init - Initialize the shadow mapping * * Most parts of the shadow mapping can be mapped upon boot * time. Only per-process things like the thread stacks * or a new LDT have to be mapped at runtime. 
These boot- - * time mappings are permanent and nevertunmapped. + * time mappings are permanent and never unmapped. */ extern void kaiser_init(void); @@ -117,6 +113,4 @@ extern void kaiser_init(void); #endif /* __ASSEMBLY */ - - #endif /* _ASM_X86_KAISER_H */ -- cgit v1.2.3 From c52e55a2a82d3a44189810d35717d81cb4cf61d4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Aug 2017 20:11:43 -0700 Subject: kaiser: cleanups while trying for gold link While trying to get our gold link to work, four cleanups: matched the gdt_page declaration to its definition; in fiddling unsuccessfully with PERCPU_INPUT(), lined up backslashes; lined up the backslashes according to convention in percpu-defs.h; deleted the unused irq_stack_pointer addition to irq_stack_union. Sad to report that aligning backslashes does not appear to help gold align to 8192: but while these did not help, they are worth keeping. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/desc.h | 2 +- arch/x86/include/asm/processor.h | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4e10d73cf018..880db91d9457 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -43,7 +43,7 @@ struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a2e0a0b4a96..f3bdaed0188f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -332,11 +332,6 @@ union irq_stack_union { char gs_base[40]; unsigned long stack_canary; }; - - struct { - char irq_stack_pointer[64]; - char unused[IRQ_STACK_SIZE - 64]; - }; }; DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -- cgit v1.2.3 From aeda21d77e22fb382c51fd3f6bbb18df69bc032f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 9 Sep 2017 17:31:18 -0700 Subject: kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET There's a 0x1000 in various places, which looks better with a name. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 7394ba9f9951..051acf678cda 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -13,13 +13,16 @@ * A minimalistic kernel mapping holds the parts needed to be mapped in user * mode, such as the entry/exit functions of the user space, or the stacks. 
*/ + +#define KAISER_SHADOW_PGD_OFFSET 0x1000 + #ifdef __ASSEMBLY__ #ifdef CONFIG_KAISER .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg #ifdef CONFIG_KAISER_REAL_SWITCH -andq $(~0x1000), \reg +andq $(~KAISER_SHADOW_PGD_OFFSET), \reg #endif movq \reg, %cr3 .endm @@ -27,7 +30,7 @@ movq \reg, %cr3 .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg #ifdef CONFIG_KAISER_REAL_SWITCH -orq $(0x1000), \reg +orq $(KAISER_SHADOW_PGD_OFFSET), \reg #endif movq \reg, %cr3 .endm -- cgit v1.2.3 From b9d2ccc54e17b5aa50dd0c036d3f4fb4e5248d54 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 18:30:43 -0700 Subject: kaiser: delete KAISER_REAL_SWITCH option We fail to see what CONFIG_KAISER_REAL_SWITCH is for: it seems to be left over from early development, and now just obscures tricky parts of the code. Delete it before adding PCIDs, or nokaiser boot option. (Or if there is some good reason to keep the option, then it needs a help text - and a "depends on KAISER", so that all those without KAISER are not asked the question.) Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 051acf678cda..e0fc45e77aee 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -21,17 +21,13 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg -#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~KAISER_SHADOW_PGD_OFFSET), \reg -#endif movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg -#ifdef CONFIG_KAISER_REAL_SWITCH orq $(KAISER_SHADOW_PGD_OFFSET), \reg -#endif movq \reg, %cr3 .endm -- cgit v1.2.3 From eb82151d0b1df53d1ad8d060ecd554ca12eb552a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 30 Aug 2017 16:23:00 -0700 Subject: kaiser: enhanced by kernel and user PCIDs Merged performance improvements to Kaiser, using distinct kernel and user Process Context Identifiers to minimize the TLB flushing. 
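In outline, each mm now has two CR3 images: the kernel pgd tagged with the
kernel ASID, and the shadow pgd (at +0x1000) tagged with the user ASID,
normally with the NOFLUSH bit set so that the switch itself does not flush.
A rough illustration using the constants defined in the hunks below (a
sketch of the scheme, not code from the patch):

	/* kernel CR3 image: kernel pgd, kernel ASID, no flush on switch */
	unsigned long kern_cr3 = __pa(pgd) | X86_CR3_PCID_KERN_NOFLUSH;

	/* user CR3 image: shadow pgd at +0x1000, user ASID, no flush */
	unsigned long user_cr3 = __pa(pgd) | KAISER_SHADOW_PGD_OFFSET |
				 X86_CR3_PCID_USER_NOFLUSH;

When one of the contexts does need flushing, the corresponding _FLUSH
constant - the same ASID without X86_CR3_PCID_NOFLUSH (bit 63) - is used
instead.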
Signed-off-by: Dave Hansen Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/kaiser.h | 15 +++++++-- arch/x86/include/asm/pgtable_types.h | 26 +++++++++++++++ arch/x86/include/asm/tlbflush.h | 52 ++++++++++++++++++++++++----- arch/x86/include/uapi/asm/processor-flags.h | 3 +- 5 files changed, 86 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index f7ba9fbf12ee..72a4343b774f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -187,6 +187,7 @@ #define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */ #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index e0fc45e77aee..360ff3bc44a9 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -1,5 +1,8 @@ #ifndef _ASM_X86_KAISER_H #define _ASM_X86_KAISER_H + +#include /* For PCID constants */ + /* * This file includes the definitions for the KAISER feature. * KAISER is a counter measure against x86_64 side channel attacks on @@ -21,13 +24,21 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg -andq $(~KAISER_SHADOW_PGD_OFFSET), \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +orq X86_CR3_PCID_KERN_VAR, \reg movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg -orq $(KAISER_SHADOW_PGD_OFFSET), \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +/* + * This can obviously be one instruction by putting the + * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. + * But, just leave it now for simplicity. 
+ */ +orq X86_CR3_PCID_USER_VAR, \reg +orq $(KAISER_SHADOW_PGD_OFFSET), \reg movq \reg, %cr3 .endm diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a70d6100b3df..79319174e27b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -106,6 +106,32 @@ _PAGE_SOFT_DIRTY) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +/* The ASID is the lower 12 bits of CR3 */ +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) + +/* Mask for all the PCID-related bits in CR3: */ +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) +#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) + +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) +#else +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) +/* + * PCIDs are unsupported on 32-bit and none of these bits can be + * set in CR3: + */ +#define X86_CR3_PCID_KERN_FLUSH (0) +#define X86_CR3_PCID_USER_FLUSH (0) +#define X86_CR3_PCID_KERN_NOFLUSH (0) +#define X86_CR3_PCID_USER_NOFLUSH (0) +#endif + /* * The cache modes defined here are used to translate between pure SW usage * and the HW defined cache mode bits and/or PAT entries. diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 9fc5968da820..48ef37079bc2 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -12,7 +12,6 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { struct { u64 d[2]; } desc = { { pcid, addr } }; - /* * The memory clobber is because the whole point is to invalidate * stale TLB entries and, especially if we're flushing global @@ -133,14 +132,25 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) static inline void __native_flush_tlb(void) { + if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: + */ + preempt_disable(); + native_write_cr3(native_read_cr3()); + preempt_enable(); + return; + } /* - * If current->mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * We are no longer using globals with KAISER, so a + * "nonglobals" flush would work too. But, this is more + * conservative. + * + * Note, this works with CR4.PCIDE=0 or 1. */ - preempt_disable(); - native_write_cr3(native_read_cr3()); - preempt_enable(); + invpcid_flush_all(); } static inline void __native_flush_tlb_global_irq_disabled(void) @@ -162,6 +172,8 @@ static inline void __native_flush_tlb_global(void) /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -181,7 +193,31 @@ static inline void __native_flush_tlb_global(void) static inline void __native_flush_tlb_single(unsigned long addr) { - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + /* + * SIMICS #GP's if you run INVPCID with type 2/3 + * and X86_CR4_PCIDE clear. Shame! 
+ * + * The ASIDs used below are hard-coded. But, we must not + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call + * invpcid in the case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; + } + /* Flush the address out of both PCIDs. */ + /* + * An optimization here might be to determine addresses + * that are only kernel-mapped and only flush the kernel + * ASID. But, userspace flushes are probably much more + * important performance-wise. + * + * Make sure to do only a single invpcid when KAISER is + * disabled and we have only a single ASID. + */ + if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); } static inline void __flush_tlb_all(void) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 79887abcb5e1..1361779f44fe 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -77,7 +77,8 @@ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 -- cgit v1.2.3 From 0731188fc74cc2237975a2b5bedd36e2463ef10b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 17 Aug 2017 15:00:37 -0700 Subject: kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user We have many machines (Westmere, Sandybridge, Ivybridge) supporting PCID but not INVPCID: on these load_new_mm_cr3() simply crashed. Flushing user context inside load_new_mm_cr3() without the use of invpcid is difficult: momentarily switch from kernel to user context and back to do so? I'm not sure whether that can be safely done at all, and would risk polluting user context with kernel internals, and kernel context with stale user externals. Instead, follow the hint in the comment that was there: change X86_CR3_PCID_USER_VAR to be a per-cpu variable, then load_new_mm_cr3() can leave a note in it, for SWITCH_USER_CR3 on return to userspace to flush user context TLB, instead of default X86_CR3_PCID_USER_NOFLUSH. Which works well enough that there's no need to do it this way only when invpcid is unsupported: it's a good alternative to invpcid here. But there's a couple of inlines in asm/tlbflush.h that need to do the same trick, so it's best to localize all this per-cpu business in mm/kaiser.c: moving that part of the initialization from setup_pcid() to kaiser_setup_pcid(); with kaiser_flush_tlb_on_return_to_user() the function for noting an X86_CR3_PCID_USER_FLUSH. And let's keep a KAISER_SHADOW_PGD_OFFSET in there, to avoid the extra OR on exit. I did try to make the feature tests in asm/tlbflush.h more consistent with each other: there seem to be far too many ways of performing such tests, and I don't have a good grasp of their differences. At first I converted them all to be static_cpu_has(): but that proved to be a mistake, as the comment in __native_flush_tlb_single() hints; so then I reversed and made them all this_cpu_has(). Probably all gratuitous change, but that's the way it's working at present. 
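For concreteness, the noting side might look as follows - the mm/kaiser.c
definitions are outside this include/ diffstat, so this is an illustrative
sketch of the description above, not the patch itself:

	DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);

	void kaiser_flush_tlb_on_return_to_user(void)
	{
		/*
		 * Ask the next SWITCH_USER_CR3 to flush: the user ASID
		 * without the NOFLUSH bit, keeping KAISER_SHADOW_PGD_OFFSET
		 * in the image to avoid the extra OR on exit.
		 */
		this_cpu_write(X86_CR3_PCID_USER_VAR,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
	}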
I am slightly bothered by the way non-per-cpu X86_CR3_PCID_KERN_VAR gets re-initialized by each cpu (before and after these changes): no problem when (as usual) all cpus on a machine have the same features, but in principle incorrect. However, my experiment to per-cpu-ify that one did not end well... Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 18 ++++++++----- arch/x86/include/asm/tlbflush.h | 58 ++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 25 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 360ff3bc44a9..009bca514c20 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -32,13 +32,12 @@ movq \reg, %cr3 .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -/* - * This can obviously be one instruction by putting the - * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. - * But, just leave it now for simplicity. - */ -orq X86_CR3_PCID_USER_VAR, \reg -orq $(KAISER_SHADOW_PGD_OFFSET), \reg +orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg +js 9f +// FLUSH this time, reset to NOFLUSH for next time +// But if nopcid? Consider using 0x80 for user pcid? +movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +9: movq \reg, %cr3 .endm @@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); +extern unsigned long X86_CR3_PCID_KERN_VAR; +DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + /** * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 48ef37079bc2..ff8c5eb95caf 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -12,6 +12,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { struct { u64 d[2]; } desc = { { pcid, addr } }; + /* * The memory clobber is because the whole point is to invalidate * stale TLB entries and, especially if we're flushing global @@ -130,27 +131,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) cr4_set_bits(mask); } +/* + * Declare a couple of kaiser interfaces here for convenience, + * to avoid the need for asm/kaiser.h in unexpected places. + */ +#ifdef CONFIG_KAISER +extern void kaiser_setup_pcid(void); +extern void kaiser_flush_tlb_on_return_to_user(void); +#else +static inline void kaiser_setup_pcid(void) +{ +} +static inline void kaiser_flush_tlb_on_return_to_user(void) +{ +} +#endif + static inline void __native_flush_tlb(void) { - if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { - /* - * If current->mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Note, this works with CR4.PCIDE=0 or 1. */ - preempt_disable(); - native_write_cr3(native_read_cr3()); - preempt_enable(); + invpcid_flush_all_nonglobals(); return; } + /* - * We are no longer using globals with KAISER, so a - * "nonglobals" flush would work too. But, this is more - * conservative. - * - * Note, this works with CR4.PCIDE=0 or 1. 
+ * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: */ - invpcid_flush_all(); + preempt_disable(); + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global_irq_disabled(void) @@ -166,9 +182,13 @@ static inline void __native_flush_tlb_global_irq_disabled(void) static inline void __native_flush_tlb_global(void) { +#ifdef CONFIG_KAISER + /* Globals are not used at all */ + __native_flush_tlb(); +#else unsigned long flags; - if (static_cpu_has(X86_FEATURE_INVPCID)) { + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. @@ -185,10 +205,9 @@ static inline void __native_flush_tlb_global(void) * be called from deep inside debugging code.) */ raw_local_irq_save(flags); - __native_flush_tlb_global_irq_disabled(); - raw_local_irq_restore(flags); +#endif } static inline void __native_flush_tlb_single(unsigned long addr) @@ -199,9 +218,12 @@ static inline void __native_flush_tlb_single(unsigned long addr) * * The ASIDs used below are hard-coded. But, we must not * call invpcid(type=1/2) before CR4.PCIDE=1. Just call - * invpcid in the case we are called early. + * invlpg in the case we are called early. */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; } -- cgit v1.2.3 From 3b4ce0e1a17228eec71815d7997e49e403ebf2a7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 8 Sep 2017 19:26:30 -0700 Subject: kaiser: PCID 0 for kernel and 128 for user Why was 4 chosen for kernel PCID and 6 for user PCID? No good reason in a backport where PCIDs are only used for Kaiser. If we continue with those, then we shall need to add Andy Lutomirski's 4.13 commit 6c690ee1039b ("x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()"), which deals with the problem of read_cr3() callers finding stray bits in the cr3 that they expected to be page-aligned; and for hibernation, his 4.14 commit f34902c5c6c0 ("x86/hibernate/64: Mask off CR3's PCID bits in the saved CR3"). But if 0 is used for kernel PCID, then there's no need to add in those commits - whenever the kernel looks, it sees 0 in the lower bits; and 0 for kernel seems an obvious choice. And I naughtily propose 128 for user PCID. Because there's a place in _SWITCH_TO_USER_CR3 where it takes note of the need for TLB FLUSH, but needs to reset that to NOFLUSH for the next occasion. Currently it does so with a "movb $(0x80)" into the high byte of the per-cpu quadword, but that will cause a machine without PCID support to crash. Now, if %al just happened to have 0x80 in it at that point, on a machine with PCID support, but 0 on a machine without PCID support... (That will go badly wrong once the pgd can be at a physical address above 2^56, but even with 5-level paging, physical goes up to 2^52.) 
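Spelling out the arithmetic behind 128 (a summary of the above, not code
from the patch): the per-cpu cr3 image is a quadword in which

	bit 63      X86_CR3_PCID_NOFLUSH      (top byte 0x80)
	bit 12      KAISER_SHADOW_PGD_OFFSET  (shadow pgd)
	bits 11..0  ASID: 0x00 kernel, 0x80 user

so with PCID enabled, after "orq <var>, %rax" the low byte %al holds 0x80,
and "movb %al, <var>+7" resets NOFLUSH for next time; without PCID both
bytes are 0x00 and the movb is harmlessly a no-op.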
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 19 ++++++++++++------- arch/x86/include/asm/pgtable_types.h | 7 ++++--- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 009bca514c20..110a73e0572d 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -29,14 +29,19 @@ orq X86_CR3_PCID_KERN_VAR, \reg movq \reg, %cr3 .endm -.macro _SWITCH_TO_USER_CR3 reg +.macro _SWITCH_TO_USER_CR3 reg regb +/* + * regb must be the low byte portion of reg: because we have arranged + * for the low byte of the user PCID to serve as the high byte of NOFLUSH + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are + * not enabled): so that the one register can update both memory and cr3. + */ movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg js 9f -// FLUSH this time, reset to NOFLUSH for next time -// But if nopcid? Consider using 0x80 for user pcid? -movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) 9: movq \reg, %cr3 .endm @@ -49,7 +54,7 @@ popq %rax .macro SWITCH_USER_CR3 pushq %rax -_SWITCH_TO_USER_CR3 %rax +_SWITCH_TO_USER_CR3 %rax %al popq %rax .endm @@ -61,7 +66,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_USER_CR3_NO_STACK movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) -_SWITCH_TO_USER_CR3 %rax +_SWITCH_TO_USER_CR3 %rax %al movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm @@ -69,7 +74,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_KERNEL_CR3 reg .endm -.macro SWITCH_USER_CR3 reg +.macro SWITCH_USER_CR3 reg regb .endm .macro SWITCH_USER_CR3_NO_STACK .endm diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 79319174e27b..22704eca1900 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -111,16 +111,17 @@ /* Mask for all the PCID-related bits in CR3: */ #define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) + #if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) -#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) -#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) #define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) #define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) #define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) #define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) #else -#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) #define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) /* * PCIDs are unsupported on 32-bit and none of these bits can be -- cgit v1.2.3 From 20268a10ffecd9fcc04880b21fc99a9192394599 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 27 Aug 2017 16:24:27 -0700 Subject: kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user Mostly this commit is just unshouting X86_CR3_PCID_KERN_VAR and X86_CR3_PCID_USER_VAR: we usually name variables in lower-case. But why does x86_cr3_pcid_noflush need to be __aligned(PAGE_SIZE)? 
Ah, it's a leftover from when kaiser_add_user_map() once complained about mapping the same page twice. Make it __read_mostly instead. (I'm a little uneasy about all the unrelated data which shares its page getting user-mapped too, but that was so before, and not a big deal: though we call it user-mapped, it's not mapped with _PAGE_USER.) And there is a little change around the two calls to do_nmi(). Previously they set the NOFLUSH bit (if PCID supported) when forcing to kernel context before do_nmi(); now they also have the NOFLUSH bit set (if PCID supported) when restoring context after: nothing done in do_nmi() should require a TLB to be flushed here. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 110a73e0572d..48d8d70dd8c7 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -25,7 +25,7 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq X86_CR3_PCID_KERN_VAR, \reg +orq x86_cr3_pcid_noflush, \reg movq \reg, %cr3 .endm @@ -37,11 +37,10 @@ movq \reg, %cr3 * not enabled): so that the one register can update both memory and cr3. */ movq %cr3, \reg -andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg js 9f /* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ -movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) 9: movq \reg, %cr3 .endm @@ -94,8 +93,8 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -extern unsigned long X86_CR3_PCID_KERN_VAR; -DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); +extern unsigned long x86_cr3_pcid_noflush; +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; -- cgit v1.2.3 From fc8334e6b3e5d28afd4eec8a74493933f73b2784 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 26 Sep 2017 18:43:07 -0700 Subject: kaiser: paranoid_entry pass cr3 need to paranoid_exit Neel Natu points out that paranoid_entry() was wrong to assume that an entry that did not need swapgs would not need SWITCH_KERNEL_CR3: paranoid_entry (used for debug breakpoint, int3, double fault or MCE; though I think it's only the MCE case that is cause for concern here) can break in at an awkward time, between cr3 switch and swapgs, but its handling always needs kernel gs and kernel cr3. Easy to fix in itself, but paranoid_entry() also needs to convey to paranoid_exit() (and my reading of macro idtentry says paranoid_entry and paranoid_exit are always paired) how to restore the prior state. The swapgs state is already conveyed by %ebx (0 or 1), so extend that also to convey when SWITCH_USER_CR3 will be needed (2 or 3). (Yes, I'd much prefer that 0 meant no swapgs, whereas it's the other way round: and a convention shared with error_entry() and error_exit(), which I don't want to touch. Perhaps I should have inverted the bit for switch cr3 too, but did not.) 
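Spelled out, the resulting %ebx convention is roughly (a summary, not code
from the patch):

	%ebx bit 0 clear: SWAPGS needed on exit (the pre-existing convention)
	%ebx bit 1 set:   SWITCH_USER_CR3 needed on exit

so paranoid_entry() hands paranoid_exit() one of the values 0 to 3.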
paranoid_exit() would be straightforward, except for TRACE_IRQS: it did TRACE_IRQS_IRETQ when doing swapgs, but TRACE_IRQS_IRETQ_DEBUG when not: which is it supposed to use when SWITCH_USER_CR3 is split apart from that? As best as I can determine, commit 5963e317b1e9 ("ftrace/x86: Do not change stacks in DEBUG when calling lockdep") missed the swapgs case, and should have used TRACE_IRQS_IRETQ_DEBUG there too (the discrepancy has nothing to do with the liberal use of _NO_STACK and _UNSAFE_STACK hereabouts: TRACE_IRQS_OFF_DEBUG has just been used in all cases); discrepancy lovingly preserved across several paranoid_exit() cleanups, but I'm now removing it. Neel further indicates that to use SWITCH_USER_CR3_NO_STACK there in paranoid_exit() is now not only unnecessary but unsafe: might corrupt syscall entry's unsafe_stack_register_backup of %rax. Just use SWITCH_USER_CR3: and delete SWITCH_USER_CR3_NO_STACK altogether, before we make the mistake of using it again. hughd adds: this commit fixes an issue in the Kaiser-without-PCIDs part of the series, and ought to be moved earlier, if you decided to make a release of Kaiser-without-PCIDs. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 48d8d70dd8c7..3dc5f4c39b3e 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -63,20 +63,12 @@ _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm -.macro SWITCH_USER_CR3_NO_STACK -movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) -_SWITCH_TO_USER_CR3 %rax %al -movq PER_CPU_VAR(unsafe_stack_register_backup), %rax -.endm - #else /* CONFIG_KAISER */ .macro SWITCH_KERNEL_CR3 reg .endm .macro SWITCH_USER_CR3 reg regb .endm -.macro SWITCH_USER_CR3_NO_STACK -.endm .macro SWITCH_KERNEL_CR3_NO_STACK .endm -- cgit v1.2.3 From e345dcc9481543edf4a0a5df4c4c2f9597b0a997 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 24 Sep 2017 16:59:49 -0700 Subject: kaiser: add "nokaiser" boot option, using ALTERNATIVE Added "nokaiser" boot option: an early param like "noinvpcid". Most places now check int kaiser_enabled (#defined 0 when not CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S and entry_64_compat.S are using the ALTERNATIVE technique, which patches in the preferred instructions at runtime. That technique is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated. Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when nokaiser like when !CONFIG_KAISER, but not setting either when kaiser - neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL won't get set in some obscure corner, or something add PGE into CR4. By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled, all page table setup which uses pte_pfn() masks it out of the ptes. It's slightly shameful that the same declaration versus definition of kaiser_enabled appears in not one, not two, but in three header files (asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way, than with #including any of those in any of the others; and did not feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes them all, so we shall hear about it if they get out of synch. 
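For concreteness, the boot-option plumbing might look like this - the
kaiser.c side is outside this include/ diffstat, and the setup function
name here is illustrative:

	int kaiser_enabled __read_mostly = 1;

	static int __init nokaiser_setup(char *str)
	{
		kaiser_enabled = 0;
		return 0;
	}
	early_param("nokaiser", nokaiser_setup);

with X86_FEATURE_KAISER force-set only while kaiser_enabled remains 1, so
that the ALTERNATIVEs in entry_64.S patch down to no-ops on nokaiser boots.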
Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER from kaiser.c; removed the unused native_get_normal_pgd(); removed the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some comments. But more interestingly, set CR4.PSE in secondary_startup_64: the manual is clear that it does not matter whether it's 0 or 1 when 4-level-pts are enabled, but I was distracted to find cr4 different on BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask(). Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 3 +++ arch/x86/include/asm/kaiser.h | 27 ++++++++++++++++++------- arch/x86/include/asm/pgtable.h | 20 ++++++++++++------ arch/x86/include/asm/pgtable_64.h | 13 ++++-------- arch/x86/include/asm/pgtable_types.h | 4 ---- arch/x86/include/asm/tlbflush.h | 39 +++++++++++++++++++++++------------- 6 files changed, 66 insertions(+), 40 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 72a4343b774f..3fc40a43578f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -200,6 +200,9 @@ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 3dc5f4c39b3e..96643a9c194c 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -46,28 +46,33 @@ movq \reg, %cr3 .endm .macro SWITCH_KERNEL_CR3 -pushq %rax +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER _SWITCH_TO_KERNEL_CR3 %rax popq %rax +8: .endm .macro SWITCH_USER_CR3 -pushq %rax +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER _SWITCH_TO_USER_CR3 %rax %al popq %rax +8: .endm .macro SWITCH_KERNEL_CR3_NO_STACK -movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +ALTERNATIVE "jmp 8f", \ + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ + X86_FEATURE_KAISER _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +8: .endm #else /* CONFIG_KAISER */ -.macro SWITCH_KERNEL_CR3 reg +.macro SWITCH_KERNEL_CR3 .endm -.macro SWITCH_USER_CR3 reg regb +.macro SWITCH_USER_CR3 .endm .macro SWITCH_KERNEL_CR3_NO_STACK .endm @@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif /* CONFIG_KAISER */ + +/* + * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, + * so as to build with tests on kaiser_enabled instead of #ifdefs. 
+ */ + /** * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range @@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); */ extern void kaiser_init(void); -#endif /* CONFIG_KAISER */ - #endif /* __ASSEMBLY */ #endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 176035fa057e..051beec179f4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -18,6 +18,12 @@ #ifndef __ASSEMBLY__ #include +#ifdef CONFIG_KAISER +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -660,7 +666,7 @@ static inline int pgd_bad(pgd_t pgd) * page table by accident; it will fault on the first * instruction it tries to run. See native_set_pgd(). */ - if (IS_ENABLED(CONFIG_KAISER)) + if (kaiser_enabled) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -865,12 +871,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_KAISER - /* Clone the shadow pgd part as well */ - memcpy(native_get_shadow_pgd(dst), - native_get_shadow_pgd(src), - count * sizeof(pgd_t)); + if (kaiser_enabled) { + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); + } #endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 8629be9e6649..233d19c9f22e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { +#ifdef CONFIG_DEBUG_VM + /* linux/mmdebug.h may not have been included at this point */ + BUG_ON(!kaiser_enabled); +#endif return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); } - -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) -{ - return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); -} #else static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) { @@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) BUILD_BUG_ON(1); return NULL; } -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) -{ - return pgdp; -} #endif /* CONFIG_KAISER */ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 22704eca1900..22547baa458a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -39,11 +39,7 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#ifdef CONFIG_KAISER -#define _PAGE_GLOBAL (_AT(pteval_t, 0)) -#else #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index ff8c5eb95caf..b376095a1fd9 100644 --- a/arch/x86/include/asm/tlbflush.h +++ 
b/arch/x86/include/asm/tlbflush.h @@ -136,9 +136,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) * to avoid the need for asm/kaiser.h in unexpected places. */ #ifdef CONFIG_KAISER +extern int kaiser_enabled; extern void kaiser_setup_pcid(void); extern void kaiser_flush_tlb_on_return_to_user(void); #else +#define kaiser_enabled 0 static inline void kaiser_setup_pcid(void) { } @@ -163,7 +165,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - if (this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); @@ -174,20 +176,30 @@ static inline void __native_flush_tlb_global_irq_disabled(void) unsigned long cr4; cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + if (cr4 & X86_CR4_PGE) { + /* clear PGE and flush TLB of all entries */ + native_write_cr4(cr4 & ~X86_CR4_PGE); + /* restore PGE as it was before */ + native_write_cr4(cr4); + } else { + /* + * x86_64 microcode update comes this way when CR4.PGE is not + * enabled, and it's safer for all callers to allow this case. + */ + native_write_cr3(native_read_cr3()); + } } static inline void __native_flush_tlb_global(void) { -#ifdef CONFIG_KAISER - /* Globals are not used at all */ - __native_flush_tlb(); -#else unsigned long flags; + if (kaiser_enabled) { + /* Globals are not used at all */ + __native_flush_tlb(); + return; + } + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes @@ -207,7 +219,6 @@ static inline void __native_flush_tlb_global(void) raw_local_irq_save(flags); __native_flush_tlb_global_irq_disabled(); raw_local_irq_restore(flags); -#endif } static inline void __native_flush_tlb_single(unsigned long addr) @@ -222,7 +233,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) */ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { - if (this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; @@ -237,9 +248,9 @@ static inline void __native_flush_tlb_single(unsigned long addr) * Make sure to do only a single invpcid when KAISER is * disabled and we have only a single ASID. */ - if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) - invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); - invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + if (kaiser_enabled) + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); } static inline void __flush_tlb_all(void) -- cgit v1.2.3 From 2dff99eb0335f9e0817410696a180dba25ca7371 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2017 20:49:04 -0700 Subject: kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush Now that we're playing the ALTERNATIVE game, use that more efficient method: instead of user-mapping an extra page, and reading an extra cacheline each time for x86_cr3_pcid_noflush. Neel has found that __stringify(bts $X86_CR3_PCID_NOFLUSH_BIT, %rax) is a working substitute for the "bts $63, %rax" in these ALTERNATIVEs; but the one line with $63 in looks clearer, so let's stick with that. Worried about what happens with an ALTERNATIVE between the jump and jump label in another ALTERNATIVE? 
I was, but have checked the combinations in SWITCH_KERNEL_CR3_NO_STACK at entry_SYSCALL_64, and it does a good job. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 96643a9c194c..906150d6094e 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -25,7 +25,8 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq x86_cr3_pcid_noflush, \reg +/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ +ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID movq \reg, %cr3 .endm @@ -39,7 +40,7 @@ movq \reg, %cr3 movq %cr3, \reg orq PER_CPU_VAR(x86_cr3_pcid_user), \reg js 9f -/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */ movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) 9: movq \reg, %cr3 @@ -90,7 +91,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -extern unsigned long x86_cr3_pcid_noflush; DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; -- cgit v1.2.3 From 0651b3ad99dd59269e2ec883338ab8fba617e203 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 4 Nov 2017 18:23:24 -0700 Subject: kaiser: asm/tlbflush.h handle noPGE at lower level I found asm/tlbflush.h too twisty, and think it safer not to avoid __native_flush_tlb_global_irq_disabled() in the kaiser_enabled case, but instead let it handle kaiser_enabled along with cr3: it can just use __native_flush_tlb() for that, no harm in re-disabling preemption. (This is not the same change as Kirill and Dave have suggested for upstream, flipping PGE in cr4: that's neat, but needs a cpu_has_pge check; cr3 is enough for kaiser, and thought to be cheaper than cr4.) Also delete the X86_FEATURE_INVPCID invpcid_flush_all_nonglobals() preference from __native_flush_tlb(): unlike the invpcid_flush_all() preference in __native_flush_tlb_global(), it's not seen in upstream 4.14, and was recently reported to be surprisingly slow. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index b376095a1fd9..6fdc8c399601 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -151,14 +151,6 @@ static inline void kaiser_flush_tlb_on_return_to_user(void) static inline void __native_flush_tlb(void) { - if (this_cpu_has(X86_FEATURE_INVPCID)) { - /* - * Note, this works with CR4.PCIDE=0 or 1. - */ - invpcid_flush_all_nonglobals(); - return; - } - /* * If current->mm == NULL then we borrow a mm which may change during a * task switch and therefore we must not be preempted while we write CR3 @@ -182,11 +174,8 @@ static inline void __native_flush_tlb_global_irq_disabled(void) /* restore PGE as it was before */ native_write_cr4(cr4); } else { - /* - * x86_64 microcode update comes this way when CR4.PGE is not - * enabled, and it's safer for all callers to allow this case. 
- */ - native_write_cr3(native_read_cr3()); + /* do it with cr3, letting kaiser flush user PCID */ + __native_flush_tlb(); } } @@ -194,12 +183,6 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; - if (kaiser_enabled) { - /* Globals are not used at all */ - __native_flush_tlb(); - return; - } - if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes @@ -255,11 +238,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (cpu_has_pge) - __flush_tlb_global(); - else - __flush_tlb(); - + __flush_tlb_global(); /* * Note: if we somehow had PCID but not PGE, then this wouldn't work -- * we'd end up flushing kernel translations for the current ASID but -- cgit v1.2.3 From 8eaca4c7d9f167209a9cc568ff028c0a3b0deb2d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 4 Nov 2017 18:43:06 -0700 Subject: kaiser: kaiser_flush_tlb_on_return_to_user() check PCID Let kaiser_flush_tlb_on_return_to_user() do the X86_FEATURE_PCID check, instead of each caller doing it inline first: nobody needs to optimize for the noPCID case, it's clearer this way, and better suits later changes. Replace those no-op X86_CR3_PCID_KERN_FLUSH lines by a BUILD_BUG_ON() in load_new_mm_cr3(), in case something changes. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6fdc8c399601..73865b174090 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -157,7 +157,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled) kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); @@ -216,7 +216,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) */ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { - if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled) kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; -- cgit v1.2.3 From 7f79599df9c4a36130f7a4f6778b334a97632477 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Dec 2017 13:57:16 +0100 Subject: x86/kaiser: Move feature detection up ... before the first use of kaiser_enabled as otherwise funky things happen: about to get started... 
(XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0000] (XEN) Pagetable walk from ffff88022a449090: (XEN) L4[0x110] = 0000000229e0e067 0000000000001e0e (XEN) L3[0x008] = 0000000000000000 ffffffffffffffff (XEN) domain_crash_sync called from entry.S: fault at ffff82d08033fd08 entry.o#create_bounce_frame+0x135/0x14d (XEN) Domain 0 (vcpu#0) crashed on cpu#0: (XEN) ----[ Xen-4.9.1_02-3.21 x86_64 debug=n Not tainted ]---- (XEN) CPU: 0 (XEN) RIP: e033:[] (XEN) RFLAGS: 0000000000000286 EM: 1 CONTEXT: pv guest (d0v0) Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 906150d6094e..b5e46aa683f4 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -96,8 +96,10 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; extern int kaiser_enabled; +extern void __init kaiser_check_boottime_disable(void); #else #define kaiser_enabled 0 +static inline void __init kaiser_check_boottime_disable(void) {} #endif /* CONFIG_KAISER */ /* -- cgit v1.2.3 From 3e1457d6bf26d9ec300781f84cd0057e44deb45d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jan 2018 10:43:15 -0800 Subject: KPTI: Rename to PAGE_TABLE_ISOLATION This renames CONFIG_KAISER to CONFIG_PAGE_TABLE_ISOLATION. Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/kaiser.h | 12 ++++++------ arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable_64.h | 4 ++-- arch/x86/include/asm/pgtable_types.h | 2 +- arch/x86/include/asm/tlbflush.h | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3fc40a43578f..f6605712ca90 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -201,7 +201,7 @@ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... 
*/ -#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index b5e46aa683f4..802bbbdfe143 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -20,7 +20,7 @@ #define KAISER_SHADOW_PGD_OFFSET 0x1000 #ifdef __ASSEMBLY__ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg @@ -69,7 +69,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax 8: .endm -#else /* CONFIG_KAISER */ +#else /* CONFIG_PAGE_TABLE_ISOLATION */ .macro SWITCH_KERNEL_CR3 .endm @@ -78,11 +78,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_KERNEL_CR3_NO_STACK .endm -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ #else /* __ASSEMBLY__ */ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Upon kernel/user mode switch, it may happen that the address * space has to be switched before the registers have been @@ -100,10 +100,10 @@ extern void __init kaiser_check_boottime_disable(void); #else #define kaiser_enabled 0 static inline void __init kaiser_check_boottime_disable(void) {} -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ /* - * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, + * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set, * so as to build with tests on kaiser_enabled instead of #ifdefs. */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 051beec179f4..84c62d950023 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -18,7 +18,7 @@ #ifndef __ASSEMBLY__ #include -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern int kaiser_enabled; #else #define kaiser_enabled 0 @@ -872,7 +872,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION if (kaiser_enabled) { /* Clone the shadow pgd part as well */ memcpy(native_get_shadow_pgd(dst), diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 233d19c9f22e..c810226e741a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,7 +106,7 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) @@ -127,7 +127,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) BUILD_BUG_ON(1); return NULL; } -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 22547baa458a..8dba273da25a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -109,7 +109,7 @@ #define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) #define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) -#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +#if 
defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64) /* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ #define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 73865b174090..a691b66cc40a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -135,7 +135,7 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) * Declare a couple of kaiser interfaces here for convenience, * to avoid the need for asm/kaiser.h in unexpected places. */ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern int kaiser_enabled; extern void kaiser_setup_pcid(void); extern void kaiser_flush_tlb_on_return_to_user(void); -- cgit v1.2.3 From 755bd549d9328d6d1e949a0a213f9a78e84d11fc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:20 -0800 Subject: x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap commit dac16fba6fc590fa7239676b35ed75dae4c4cd2b upstream. Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/9d37826fdc7e2d2809efe31d5345f97186859284.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Jamie Iles Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pvclock.h | 9 +++++++++ arch/x86/include/asm/vdso.h | 1 + 2 files changed, 10 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index baad72e4c100..6045cef376c2 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,6 +4,15 @@ #include #include +#ifdef CONFIG_PARAVIRT_CLOCK +extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); +#else +static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return NULL; +} +#endif + /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 756de9190aec..deabaf9759b6 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -22,6 +22,7 @@ struct vdso_image { long sym_vvar_page; long sym_hpet_page; + long sym_pvclock_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; -- cgit v1.2.3 From 6dcf5491e01c3d1135497d0661bb5b35a126b9d8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jan 2018 17:42:45 +0100 Subject: Map the vsyscall page with _PAGE_USER This needs to happen early in kaiser_pagetable_walk(), before the hierarchy is established so that _PAGE_USER permission can be really set. A proper fix would be to teach kaiser_pagetable_walk() to update those permissions but the vsyscall page is the only exception here so ... Signed-off-by: Borislav Petkov Acked-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/vsyscall.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 6ba66ee79710..4865e10dbb55 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -12,12 +12,14 @@ extern void map_vsyscall(void); * Returns true if handled. 
*/ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); +extern bool vsyscall_enabled(void); #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { return false; } +static inline bool vsyscall_enabled(void) { return false; } #endif #endif /* _ASM_X86_VSYSCALL_H */ -- cgit v1.2.3 From 1a699374533b23ec4deff885db121a5e4c42aa27 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:18 -0800 Subject: x86/vdso: Fix build on PARAVIRT_CLOCK=y, KVM_GUEST=n commit 8705d603edd49f1cff165cd3b7998f4c7f098d27 upstream. arch/x86/built-in.o: In function `arch_setup_additional_pages': (.text+0x587): undefined reference to `pvclock_pvti_cpu0_va' KVM_GUEST selects PARAVIRT_CLOCK, so we can make pvclock_pvti_cpu0_va depend on KVM_GUEST. Signed-off-by: Andy Lutomirski Tested-by: Borislav Petkov Cc: Oleg Nesterov Cc: Kees Cook Link: http://lkml.kernel.org/r/444d38a9bcba832685740ea1401b569861d09a72.1451446564.git.luto@kernel.org Signed-off-by: Thomas Gleixner Cc: James Dingwall Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pvclock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 6045cef376c2..c926255745e1 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,7 +4,7 @@ #include #include -#ifdef CONFIG_PARAVIRT_CLOCK +#ifdef CONFIG_KVM_GUEST extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); #else static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -- cgit v1.2.3 From 7ec5d87df34a90758cf2aaf6824bb748454a8f35 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 12 Jan 2018 15:00:02 -0500 Subject: x86/pti/efi: broken conversion from efi to kernel page table In entry_64.S we have code like this: /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax movq %rax, %cr3 2: /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ call do_nmi With this instruction: andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax we unconditionally switch from whatever our CR3 was to the kernel page table. But in arch/x86/platform/efi/efi_64.c we temporarily set a different page table, one that does not have the kernel page table at offset 0x1000 from it. Look in efi_thunk() and efi_thunk_set_virtual_address_map(). So, while CR3 points to the other page table, we get an NMI interrupt, and clear 0x1000 from CR3, resulting in a bogus CR3 if the 0x1000 bit was set. The efi page table comes from realmode/rm/trampoline_64.S: arch/x86/realmode/rm/trampoline_64.S 141 .bss 142 .balign PAGE_SIZE 143 GLOBAL(trampoline_pgd) .space PAGE_SIZE Notice: alignment is PAGE_SIZE, so after applying KAISER_SHADOW_PGD_OFFSET, which is equal to PAGE_SIZE, we can get a different page table. But even if we fix the alignment here, the trampoline binary is later copied into dynamically allocated memory in reserve_real_mode(), so we need to fix that place as well.
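To make the alignment hazard concrete, here is a minimal sketch (an editorial illustration, not code from the patch; the 0x1000 constant mirrors KAISER_SHADOW_PGD_OFFSET):

	/* Sketch: clearing bit 12 only round-trips if the pgd is 8k-aligned. */
	unsigned long trampoline_cr3 = 0x7f001000;	   /* PAGE_SIZE-aligned, bit 12 set */
	unsigned long masked = trampoline_cr3 & ~0x1000UL; /* what the NMI path does */

	/*
	 * masked == 0x7f000000: an entirely different page, not a kernel/shadow
	 * selector. Only with 2*PAGE_SIZE alignment is bit 12 guaranteed to be
	 * the KAISER user/kernel bit rather than part of the pgd address.
	 */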
Fixes: 8a43ddfb93a0 ("KAISER: Kernel Address Isolation") Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 802bbbdfe143..48c791a411ab 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -19,6 +19,16 @@ #define KAISER_SHADOW_PGD_OFFSET 0x1000 +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * A page table address must have this alignment to stay the same when + * KAISER_SHADOW_PGD_OFFSET mask is applied + */ +#define KAISER_KERNEL_PGD_ALIGNMENT (KAISER_SHADOW_PGD_OFFSET << 1) +#else +#define KAISER_KERNEL_PGD_ALIGNMENT PAGE_SIZE +#endif + #ifdef __ASSEMBLY__ #ifdef CONFIG_PAGE_TABLE_ISOLATION -- cgit v1.2.3 From 65b28590de24afafcdc737b9fe86af0a9d2fbbdb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:32 +0100 Subject: x86/cpufeatures: Make CPU bugs sticky commit 6cbd2171e89b13377261d15e64384df60ecb530e upstream. There is currently no way to force CPU bug bits like CPU feature bits. That makes it impossible to set a bug bit once at boot and have it stick for all upcoming CPUs. Extend the force set/clear arrays to handle bug bits as well. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de Signed-off-by: Ingo Molnar Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/include/asm/processor.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index f6605712ca90..34c4106230f1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -359,6 +359,8 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) + #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) #define cpu_has_de boot_cpu_has(X86_FEATURE_DE) #define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f3bdaed0188f..c124d6ab4bf9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -156,8 +156,8 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern struct tss_struct doublefault_tss; -extern __u32 cpu_caps_cleared[NCAPINTS]; -extern __u32 cpu_caps_set[NCAPINTS]; +extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); -- cgit v1.2.3 From 07c7aa5e7e8ac83768246822b61ebffbdea61ff7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:33 +0100 Subject: x86/cpufeatures: Add X86_BUG_CPU_INSECURE commit a89f040fa34ec9cd682aed98b8f04e3c47d998bd upstream. 
Many x86 CPUs leak information to user space due to missing isolation of user space and kernel space page tables. There are many well documented ways to exploit that. The upcoming software mitigation of isolating the user and kernel space page tables needs a misfeature flag so code can be made runtime conditional. Add the BUG bits which indicate that the CPU is affected and add a feature bit which indicates that the software mitigation is enabled. Assume for now that _ALL_ x86 CPUs are affected by this. Exceptions can be made later. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 34c4106230f1..ae023d393c5c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -277,6 +277,7 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3 From 6349cab425ce91ba71676fba5aa6089cae0e6474 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 5 Jan 2018 15:27:34 +0100 Subject: x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN commit de791821c295cc61419a06fe5562288417d1bc58 upstream. Use the name associated with the particular attack which needs page table isolation for mitigation.
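Once forced sticky (previous patch) and given its final name, the bit is consumed like any other bug bit; a hedged usage sketch (the message text is made up, boot_cpu_has_bug() is the kernel's existing accessor):

	/* Sketch: runtime check of the Meltdown bug bit. */
	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("CPU affected by Meltdown, page table isolation needed\n");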
Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Alan Cox Cc: Jiri Kosina Cc: Linus Torvalds Cc: Tim Chen Cc: Andy Lutomirski Cc: Andi Kleen Cc: Peter Zijlstra Cc: Paul Turner Cc: Tom Lendacky Cc: Greg KH Cc: Dave Hansen Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ae023d393c5c..25055e066b4c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -277,7 +277,7 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ -#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3 From caae411b6ee026c7f43d67932e9b5008cf623293 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 6 Jan 2018 11:49:23 +0000 Subject: x86/cpufeatures: Add X86_BUG_SPECTRE_V[12] commit 99c6fa2511d8a683e61468be91b83f85452115fa upstream. Add the bug bits for Spectre v1/2 and force them unconditionally for all CPUs. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 25055e066b4c..142028afd049 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -278,6 +278,8 @@ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3 From 999d4f1961fa002bda138ddfe9119965421f85da Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 4 Jan 2018 14:37:05 +0000 Subject: x86/alternatives: Add missing '\n' at end of ALTERNATIVE inline asm commit b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 upstream. Where an ALTERNATIVE is used in the middle of an inline asm block, this would otherwise lead to the following instruction being appended directly to the trailing ".popsection", and a failed compile.
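A minimal reproducer sketch (X86_FEATURE_FOO is a placeholder feature bit, not something the patch defines):

	/* Sketch: ALTERNATIVE followed by more asm in the same block. */
	asm volatile(ALTERNATIVE("nop", "lfence", X86_FEATURE_FOO)
		     "jmp 1f\n"		/* string-concatenated after the macro */
		     "1:");

	/*
	 * Without the trailing '\n' the assembler sees the single token
	 * ".popsectionjmp 1f" and the build fails.
	 */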
Fixes: 9cebed423c84 ("x86, alternative: Use .pushsection/.popsection") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: ak@linux.intel.com Cc: Tim Chen Cc: Peter Zijlstra Cc: Paul Turner Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180104143710.8961-8-dwmw@amazon.co.uk Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 09936e9c8154..d1cf17173b1b 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -138,7 +138,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ - ".popsection" + ".popsection\n" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ OLDINSTR_2(oldinstr, 1, 2) \ @@ -149,7 +149,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ - ".popsection" + ".popsection\n" /* * This must be included *after* the definition of ALTERNATIVE due to -- cgit v1.2.3 From 20c28c04a6bc2ebd60fa20e5c3a6bf3bfa736d81 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:21 -0600 Subject: x86/cpu/AMD: Make LFENCE a serializing instruction commit e4d0e84e490790798691aaa0f2e598637f1867ec upstream. To aid in speculation control, make LFENCE a serializing instruction since it has less overhead than MFENCE. This is done by setting bit 1 of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not have this MSR. For these families, the LFENCE instruction is already serializing. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/msr-index.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 37db36fddc88..f00dfff81370 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -330,6 +330,8 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c +#define MSR_F10H_DECFG 0xc0011029 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a -- cgit v1.2.3 From 642ce1bb5ea64b1d036dff04a005d9bbe28ab5b4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:32 -0600 Subject: x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC commit 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f upstream. With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference to MFENCE_RDTSC.
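With the MSR definitions above, requesting serializing LFENCE reduces to a single MSR write; a hedged sketch of the idea (the actual change lives in the AMD CPU init code, and msr_set_bit() is the kernel's existing helper):

	/* Sketch: request serializing LFENCE on families that have DE_CFG. */
	msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);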
However, since the kernel could be running under a hypervisor that does not support writing that MSR, read the MSR back and verify that the bit has been set successfully. If the MSR can be read and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the MFENCE_RDTSC feature. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f00dfff81370..b8911aecf035 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -332,6 +332,7 @@ #define MSR_FAM10H_NODE_ID 0xc001100c #define MSR_F10H_DECFG 0xc0011029 #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a -- cgit v1.2.3 From cfc8c1d61e46fd3c60a34a5b1962eeeb03222a3d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 29 Sep 2017 17:15:36 +0300 Subject: x86/asm: Use register variable to get stack pointer value commit 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc upstream. Currently we use the current_stack_pointer() function to get the value of the stack pointer register. Since commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") ... we have a stack register variable declared. It can be used instead of the current_stack_pointer() function, which makes it possible to optimize away some excessive "mov %rsp, %<reg>" instructions: -mov %rsp,%rdx -sub %rdx,%rax -cmp $0x3fff,%rax -ja ffffffff810722fd +sub %rsp,%rax +cmp $0x3fff,%rax +ja ffffffff810722fa Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer and use it instead of the removed function. Signed-off-by: Andrey Ryabinin Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar [dwmw2: We want ASM_CALL_CONSTRAINT for retpoline] Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm.h | 11 +++++++++ arch/x86/include/asm/thread_info.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679aba703..b9c6c7a6f5a6 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -105,4 +105,15 @@ /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif +#ifndef __ASSEMBLY__ +/* + * This output constraint should be used for any inline asm which has a "call" + * instruction. Otherwise the asm may be inserted before the frame pointer + * gets set up by the containing function. If you forget to do this, objtool + * may print a "call without frame pointer save/setup" warning.
+ */ +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) +#endif + #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..9b028204685d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -166,17 +166,6 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } -static inline unsigned long current_stack_pointer(void) -{ - unsigned long sp; -#ifdef CONFIG_X86_64 - asm("mov %%rsp,%0" : "=g" (sp)); -#else - asm("mov %%esp,%0" : "=g" (sp)); -#endif - return sp; -} - #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -- cgit v1.2.3 From b76ac90af34dc08f057da15aa34584a0a3d381b5 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Sun, 11 Dec 2016 02:09:18 +0100 Subject: x86/kbuild: enable modversions for symbols exported from asm commit 334bb773876403eae3457d81be0b8ea70f8e4ccc upstream. Commit 4efca4ed ("kbuild: modversions for EXPORT_SYMBOL() for asm") adds modversion support for symbols exported from asm files. Architectures must include C-style declarations for those symbols in asm/asm-prototypes.h in order for them to be versioned. Add these declarations for x86, and an architecture-independent file that can be used for common symbols. With f27c2f6 reverting 8ab2ae6 ("default exported asm symbols to zero") we produce a scary warning on x86; this commit fixes that. Signed-off-by: Adam Borowski Tested-by: Kalle Valo Acked-by: Nicholas Piggin Tested-by: Peter Wu Tested-by: Oliver Hartkopp Signed-off-by: Michal Marek Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm-prototypes.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 arch/x86/include/asm/asm-prototypes.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..44b8762fa0c7 --- /dev/null +++ b/arch/x86/include/asm/asm-prototypes.h @@ -0,0 +1,16 @@ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +#endif -- cgit v1.2.3 From b8e7a489b51810992ba3266bfd9e043709e07267 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 12:23:25 -0700 Subject: x86/asm: Make asm/alternative.h safe from assembly commit f005f5d860e0231fe212cfda8c1a3148b99609f4 upstream. asm/alternative.h isn't directly useful from assembly, but it shouldn't break the build. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5b693fcef99fe6e80341c9e97a002fb23871e91.1461698311.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index d1cf17173b1b..215ea9214215 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H +#ifndef __ASSEMBLY__ + #include #include #include @@ -271,4 +273,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_ALTERNATIVE_H */ -- cgit v1.2.3 From 3c5e10905263dbe9fbc621d1889b85e9c867da25 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:25 +0000 Subject: x86/retpoline: Add initial retpoline support commit 76b043848fd22dbf7f8bf3a1452f8c70d557b860 upstream. Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide the corresponding thunks. Provide assembler macros for invoking the thunks in the same way that GCC does, from native and inline assembler. This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In some circumstances, IBRS microcode features may be used instead, and the retpoline can be disabled. On AMD CPUs if lfence is serialising, the retpoline can be dramatically simplified to a simple "lfence; jmp *\reg". A future patch, after it has been verified that lfence really is serialising in all circumstances, can enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition to X86_FEATURE_RETPOLINE. Do not align the retpoline in the altinstr section, because there is no guarantee that it stays aligned when it's copied over the oldinstr during alternative patching. 
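From C code, the CALL_NOSPEC/THUNK_TARGET pair added below replaces a bare indirect call; a hedged usage sketch with a made-up function pointer (real call sites must also declare outputs and the registers the callee clobbers):

	/* Sketch: indirect call routed through the retpoline-aware macro. */
	void (*fn)(void) = some_hook;	/* hypothetical pointer */

	asm volatile(CALL_NOSPEC
		     : : THUNK_TARGET(fn) : "memory");

The same conversion pattern appears in the Xen hypercall patch further down.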
[ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks] [ tglx: Put actual function CALL/JMP in front of the macros, convert to symbolic labels ] [ dwmw2: Convert back to numeric labels, merge objtool fixes ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-4-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse [ 4.4 backport: removed objtool annotation since there is no objtool ] Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm-prototypes.h | 25 ++++++++ arch/x86/include/asm/cpufeature.h | 2 + arch/x86/include/asm/nospec-branch.h | 106 ++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+) create mode 100644 arch/x86/include/asm/nospec-branch.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 44b8762fa0c7..b15aa4083dfd 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -10,7 +10,32 @@ #include #include #include +#include #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); #endif + +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_32 +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); +#else +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); +INDIRECT_THUNK(8) +INDIRECT_THUNK(9) +INDIRECT_THUNK(10) +INDIRECT_THUNK(11) +INDIRECT_THUNK(12) +INDIRECT_THUNK(13) +INDIRECT_THUNK(14) +INDIRECT_THUNK(15) +#endif +INDIRECT_THUNK(ax) +INDIRECT_THUNK(bx) +INDIRECT_THUNK(cx) +INDIRECT_THUNK(dx) +INDIRECT_THUNK(si) +INDIRECT_THUNK(di) +INDIRECT_THUNK(bp) +INDIRECT_THUNK(sp) +#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 142028afd049..0fbc98568018 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -200,6 +200,8 @@ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h new file mode 100644 index 000000000000..5763548fb30b --- /dev/null +++ b/arch/x86/include/asm/nospec-branch.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NOSPEC_BRANCH_H__ +#define __NOSPEC_BRANCH_H__ + +#include +#include +#include + +#ifdef __ASSEMBLY__ + +/* + * These are the bare retpoline primitives for indirect jmp and call. + * Do not use these directly; they only exist to make the ALTERNATIVE + * invocation below less ugly. 
+ */ +.macro RETPOLINE_JMP reg:req + call .Ldo_rop_\@ +.Lspec_trap_\@: + pause + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov \reg, (%_ASM_SP) + ret +.endm + +/* + * This is a wrapper around RETPOLINE_JMP so the called function in reg + * returns to the instruction after the macro. + */ +.macro RETPOLINE_CALL reg:req + jmp .Ldo_call_\@ +.Ldo_retpoline_jmp_\@: + RETPOLINE_JMP \reg +.Ldo_call_\@: + call .Ldo_retpoline_jmp_\@ +.endm + +/* + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple + * indirect jmp/call which may be susceptible to the Spectre variant 2 + * attack. + */ +.macro JMP_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE_2 __stringify(jmp *\reg), \ + __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD +#else + jmp *\reg +#endif +.endm + +.macro CALL_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE_2 __stringify(call *\reg), \ + __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD +#else + call *\reg +#endif +.endm + +#else /* __ASSEMBLY__ */ + +#if defined(CONFIG_X86_64) && defined(RETPOLINE) + +/* + * Since the inline asm uses the %V modifier which is only in newer GCC, + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + */ +# define CALL_NOSPEC \ + ALTERNATIVE( \ + "call *%[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ + X86_FEATURE_RETPOLINE) +# define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +/* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. + */ +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ + " jmp 904f;\n" \ + " .align 16\n" \ + "901: call 903f;\n" \ + "902: pause;\n" \ + " jmp 902b;\n" \ + " .align 16\n" \ + "903: addl $4, %%esp;\n" \ + " pushl %[thunk_target];\n" \ + " ret;\n" \ + " .align 16\n" \ + "904: call 901b;\n", \ + X86_FEATURE_RETPOLINE) + +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#else /* No retpoline */ +# define CALL_NOSPEC "call *%[thunk_target]\n" +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __NOSPEC_BRANCH_H__ */ -- cgit v1.2.3 From 9f789bc5711bcacb5df003594b992f0c1cc19df4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:26 +0000 Subject: x86/spectre: Add boot time option to select Spectre v2 mitigation commit da285121560e769cc31797bba6422eea71d473e0 upstream. Add a spectre_v2= option to select the mitigation used for the indirect branch speculation vulnerability. Currently, the only option available is retpoline, in its various forms. This will be expanded to cover the new IBRS/IBPB microcode features. The RETPOLINE_AMD feature relies on a serializing LFENCE for speculation control. For AMD hardware, only set RETPOLINE_AMD if LFENCE is a serializing instruction, which is indicated by the LFENCE_RDTSC feature. 
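The resulting boot-time usage looks like this (option spellings per this commit; the annotations are editorial):

	spectre_v2=auto			# default: pick the best available mitigation
	spectre_v2=retpoline		# retpoline, flavour chosen automatically
	spectre_v2=retpoline,amd	# AMD variant: serializing lfence; jmp *reg
	spectre_v2=retpoline,generic	# full generic retpoline
	spectre_v2=on / spectre_v2=off	# force-enable / disable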
[ tglx: Folded back the LFENCE/AMD fixes and reworked it so IBRS integration becomes simple ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-5-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 5763548fb30b..fe48aeee79d1 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -102,5 +102,15 @@ # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif +/* The Spectre V2 mitigation variants */ +enum spectre_v2_mitigation { + SPECTRE_V2_NONE, + SPECTRE_V2_RETPOLINE_MINIMAL, + SPECTRE_V2_RETPOLINE_MINIMAL_AMD, + SPECTRE_V2_RETPOLINE_GENERIC, + SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS, +}; + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ -- cgit v1.2.3 From 6b222e7483af4fd8f632efbf3b91025c2359b10a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:31 +0000 Subject: x86/retpoline/xen: Convert Xen hypercall indirect jumps commit ea08816d5b185ab3d09e95e393f265af54560350 upstream. Convert indirect call in Xen hypercall to use non-speculative sequence, when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Reviewed-by: Juergen Gross Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-10-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/xen/hypercall.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 85133b2b8e99..0977e7607046 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -215,9 +216,9 @@ privcmd_call(unsigned call, __HYPERCALL_5ARG(a1, a2, a3, a4, a5); stac(); - asm volatile("call *%[call]" + asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM - : [call] "a" (&hypercall_page[call]) + : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); clac(); -- cgit v1.2.3 From eebc3f8adee0a6f43a4789ef0bf5c5b35de8cfe4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 11:11:27 +0000 Subject: x86/retpoline: Fill return stack buffer on vmexit commit 117cc7a908c83697b0b737d15ae1eb5943afe35b upstream. In accordance with the Intel and AMD documentation, we need to overwrite all entries in the RSB on exiting a guest, to prevent malicious branch target predictions from affecting the host kernel. This is needed both for retpoline and for IBRS. 
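The helper introduced below is invoked from the KVM vcpu-run paths immediately after the hardware VM-exit; roughly (call site sketched, not quoted):

	/* Sketch: in vmx_vcpu_run()/svm_vcpu_run(), right after #VMEXIT: */
	vmexit_fill_RSB();	/* overwrite guest-trained RSB entries */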
[ak: numbers again for the RSB stuffing labels] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515755487-8524-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 76 +++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index fe48aeee79d1..1afd04eb5fb7 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,6 +7,48 @@ #include #include +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * We define a CPP macro such that it can be used from both .S files and + * inline assembly. It's possible to do a .macro and then include that + * from C via asm(".include ") but let's not go there. + */ + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +/* + * Google experimented with loop-unrolling and this turned out to be + * the optimal version — two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr, sp) \ + mov $(nr/2), reg; \ +771: \ + call 772f; \ +773: /* speculation trap */ \ + pause; \ + jmp 773b; \ +772: \ + call 774f; \ +775: /* speculation trap */ \ + pause; \ + jmp 775b; \ +774: \ + dec reg; \ + jnz 771b; \ + add $(BITS_PER_LONG/8) * nr, sp; + #ifdef __ASSEMBLY__ /* @@ -59,6 +101,19 @@ #else call *\reg #endif +.endm + + /* + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP + * monstrosity above, manually. + */ +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ + \ftr +.Lskip_rsb_\@: +#endif .endm #else /* __ASSEMBLY__ */ @@ -97,7 +152,7 @@ X86_FEATURE_RETPOLINE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) -#else /* No retpoline */ +#else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -112,5 +167,24 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future + * CPUs with IBRS_ATT *might* it be avoided. 
+ */ +static inline void vmexit_fill_RSB(void) +{ +#ifdef CONFIG_RETPOLINE + unsigned long loops = RSB_CLEAR_LOOPS / 2; + + asm volatile (ALTERNATIVE("jmp 910f", + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), + X86_FEATURE_RETPOLINE) + "910:" + : "=&r" (loops), ASM_CALL_CONSTRAINT + : "r" (loops) : "memory" ); +#endif +} #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ -- cgit v1.2.3 From fba063e6dfb413e06b9daa5d45b164761172f5ed Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Sat, 13 Jan 2018 17:27:30 -0600 Subject: x86/retpoline: Add LFENCE to the retpoline/RSB filling RSB macros commit 28d437d550e1e39f805d99f9f8ac399c778827b7 upstream. The PAUSE instruction is currently used in the retpoline and RSB filling macros as a speculation trap. The use of PAUSE was originally suggested because it showed a very, very small difference in the amount of cycles/time used to execute the retpoline as compared to LFENCE. On AMD, the PAUSE instruction is not a serializing instruction, so the pause/jmp loop will use excess power as it is speculated over waiting for return to mispredict to the correct target. The RSB filling macro is applicable to AMD, and, if software is unable to verify that LFENCE is serializing on AMD (possible when running under a hypervisor), the generic retpoline support will be used and, so, is also applicable to AMD. Keep the current usage of PAUSE for Intel, but add an LFENCE instruction to the speculation trap for AMD. The same sequence has been adopted by GCC for the GCC generated retpolines. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Acked-by: David Woodhouse Acked-by: Arjan van de Ven Cc: Rik van Riel Cc: Andi Kleen Cc: Paul Turner Cc: Peter Zijlstra Cc: Tim Chen Cc: Jiri Kosina Cc: Dave Hansen Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Kees Cook Link: https://lkml.kernel.org/r/20180113232730.31060.36287.stgit@tlendack-t1.amdoffice.net Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1afd04eb5fb7..e28a9ff1246c 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -11,7 +11,7 @@ * Fill the CPU return stack buffer. * * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; jmp' loop to capture speculative execution. + * infinite 'pause; lfence; jmp' loop to capture speculative execution. * * This is required in various cases for retpoline and IBRS-based * mitigations for the Spectre variant 2 vulnerability. Sometimes to @@ -38,11 +38,13 @@ call 772f; \ 773: /* speculation trap */ \ pause; \ + lfence; \ jmp 773b; \ 772: \ call 774f; \ 775: /* speculation trap */ \ pause; \ + lfence; \ jmp 775b; \ 774: \ dec reg; \ @@ -60,6 +62,7 @@ call .Ldo_rop_\@ .Lspec_trap_\@: pause + lfence jmp .Lspec_trap_\@ .Ldo_rop_\@: mov \reg, (%_ASM_SP) @@ -142,6 +145,7 @@ " .align 16\n" \ "901: call 903f;\n" \ "902: pause;\n" \ + " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ "903: addl $4, %%esp;\n" \ -- cgit v1.2.3 From f59e7ce17ba327245c8feb312d447b09d3b98eba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Jan 2018 16:28:26 +0100 Subject: x86/mce: Make machine check speculation protected commit 6f41c34d69eb005e7848716bbcafc979b35037d5 upstream. 
The machine check idtentry uses an indirect branch directly from the low level code. This evades the speculation protection. Replace it by a direct call into C code and issue the indirect call there so the compiler can apply the proper speculation protection. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: David Woodhouse Niced-by: Peter Zijlstra Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801181626290.1847@nanos Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/traps.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c3496619740a..156959ca49ce 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -92,6 +92,7 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif +dotraplinkage void do_mce(struct pt_regs *, long); static inline int get_si_code(unsigned long condition) { -- cgit v1.2.3 From 799dc737680a8074a0c7c2d3426b85f4c439377f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:14:21 +0900 Subject: retpoline: Introduce start/end markers of indirect thunk commit 736e80a4213e9bbce40a7c050337047128b472ac upstream. Introduce start/end markers of __x86_indirect_thunk_* functions. To make it easy, consolidate .text.__x86.indirect_thunk.* sections into one .text.__x86.indirect_thunk section, put it at the end of the kernel text section, and add __indirect_thunk_start/end so that other subsystems (e.g. kprobes) can identify it. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/151629206178.10241.6828804696410044771.stgit@devbox Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e28a9ff1246c..4f7a5d3fed91 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -171,6 +171,9 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +extern char __indirect_thunk_start[]; +extern char __indirect_thunk_end[]; + /* * On VMEXIT we must ensure that no RSB predictions learned in the guest * can be followed in the host, by overwriting the RSB completely. Both -- cgit v1.2.3 From 11e619414b69b7f1e47baac72c5be589d86e5393 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 17 Jan 2018 14:53:28 -0800 Subject: x86/retpoline: Optimize inline assembler for vmexit_fill_RSB commit 3f7d875566d8e79c5e0b2c9a413e91b2c29e0854 upstream. The generated assembler for the C fill RSB inline asm operations has several issues: - The C code sets up the loop register, which is then immediately overwritten in __FILL_RETURN_BUFFER with the same value again. - The C code also passes in the iteration count in another register, which is not used at all. Remove these two unnecessary operations. Just rely on the single constant passed to the macro for the iterations.
Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: dave.hansen@intel.com Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: arjan@linux.intel.com Link: https://lkml.kernel.org/r/20180117225328.15414-1-andi@firstfloor.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 4f7a5d3fed91..492370b9b35b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -183,15 +183,16 @@ extern char __indirect_thunk_end[]; static inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE - unsigned long loops = RSB_CLEAR_LOOPS / 2; + unsigned long loops; asm volatile (ALTERNATIVE("jmp 910f", __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), X86_FEATURE_RETPOLINE) "910:" - : "=&r" (loops), ASM_CALL_CONSTRAINT - : "r" (loops) : "memory" ); + : "=r" (loops), ASM_CALL_CONSTRAINT + : : "memory" ); #endif } + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ -- cgit v1.2.3 From 8bb0c6a1b3b2fe92707ee6f6901bfdfc392f8cd1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 9 Dec 2016 10:24:05 -0800 Subject: x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit kernels commit 1c52d859cb2d417e7216d3e56bb7fea88444cec9 upstream. We support various non-Intel CPUs that don't have the CPUID instruction, so the M486 test was wrong. For now, fix it with a big hammer: handle missing CPUID on all 32-bit CPUs. Reported-by: One Thousand Gnomes Signed-off-by: Andy Lutomirski Cc: Juergen Gross Cc: Peter Zijlstra Cc: Brian Gerst Cc: Matthew Whitehead Cc: Borislav Petkov Cc: Henrique de Moraes Holschuh Cc: Andrew Cooper Cc: Boris Ostrovsky Cc: xen-devel Link: http://lkml.kernel.org/r/685bd083a7c036f7769510b6846315b17d6ba71f.1481307769.git.luto@kernel.org Signed-off-by: Thomas Gleixner Cc: "Zhang, Ning A" Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c124d6ab4bf9..86bccb4bd4dc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -574,7 +574,7 @@ static inline void sync_core(void) { int tmp; -#ifdef CONFIG_M486 +#ifdef CONFIG_X86_32 /* * Do a CPUID if available, otherwise do a jump. The jump * can conveniently enough be the jump around CPUID. -- cgit v1.2.3 From 0dfd5fbcae5d40e5e58bc7c34305498e965bf1ef Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Jun 2016 17:19:27 -0700 Subject: x86/cpu/intel: Introduce macros for Intel family numbers commit 970442c599b22ccd644ebfe94d1d303bf6f87c05 upstream. Problem: We have a boatload of open-coded family-6 model numbers. Half of them have these model numbers in hex and the other half in decimal. This makes grepping for them tons of fun, if you were to try. Solution: Consolidate all the magic numbers. Put all the definitions in one header. The names here are closely derived from the comments describing the models from arch/x86/events/intel/core.c. We could easily make them shorter by doing things like s/SANDYBRIDGE/SNB/, but they seemed fine even with the longer versions to me. Do not take any of these names too literally, like "DESKTOP" or "MOBILE". 
These are all colloquial names and not precise descriptions of everywhere a given model will show up. Signed-off-by: Dave Hansen Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Darren Hart Cc: Dave Hansen Cc: Denys Vlasenko Cc: Doug Thompson Cc: Eduardo Valentin Cc: H. Peter Anvin Cc: Jacob Pan Cc: Kan Liang Cc: Len Brown Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rajneesh Bhardwaj Cc: Souvik Kumar Chakravarty Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Luck Cc: Ulf Hansson Cc: Viresh Kumar Cc: Vishwanath Somayaji Cc: Zhang Rui Cc: jacob.jun.pan@intel.com Cc: linux-acpi@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20160603001927.F2A7D828@viggo.jf.intel.com Signed-off-by: Ingo Molnar Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/intel-family.h | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 arch/x86/include/asm/intel-family.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h new file mode 100644 index 000000000000..6999f7d01a0d --- /dev/null +++ b/arch/x86/include/asm/intel-family.h @@ -0,0 +1,68 @@ +#ifndef _ASM_X86_INTEL_FAMILY_H +#define _ASM_X86_INTEL_FAMILY_H + +/* + * "Big Core" Processors (Branded as Core, Xeon, etc...) + * + * The "_X" parts are generally the EP and EX Xeons, or the + * "Extreme" ones, like Broadwell-E. + * + * Things ending in "2" are usually because we have no better + * name for them. There's no processor called "WESTMERE2". + */ + +#define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_FAM6_CORE2_MEROM 0x0F +#define INTEL_FAM6_CORE2_MEROM_L 0x16 +#define INTEL_FAM6_CORE2_PENRYN 0x17 +#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D + +#define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_FAM6_NEHALEM_EP 0x1A +#define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_FAM6_WESTMERE 0x25 +#define INTEL_FAM6_WESTMERE2 0x1F +#define INTEL_FAM6_WESTMERE_EP 0x2C +#define INTEL_FAM6_WESTMERE_EX 0x2F + +#define INTEL_FAM6_SANDYBRIDGE 0x2A +#define INTEL_FAM6_SANDYBRIDGE_X 0x2D +#define INTEL_FAM6_IVYBRIDGE 0x3A +#define INTEL_FAM6_IVYBRIDGE_X 0x3E + +#define INTEL_FAM6_HASWELL_CORE 0x3C +#define INTEL_FAM6_HASWELL_X 0x3F +#define INTEL_FAM6_HASWELL_ULT 0x45 +#define INTEL_FAM6_HASWELL_GT3E 0x46 + +#define INTEL_FAM6_BROADWELL_CORE 0x3D +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 +#define INTEL_FAM6_BROADWELL_GT3E 0x47 +#define INTEL_FAM6_BROADWELL_X 0x4F + +#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E +#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +#define INTEL_FAM6_SKYLAKE_X 0x55 +#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E +#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E + +/* "Small Core" Processors (Atom) */ + +#define INTEL_FAM6_ATOM_PINEVIEW 0x1C +#define INTEL_FAM6_ATOM_LINCROFT 0x26 +#define INTEL_FAM6_ATOM_PENWELL 0x27 +#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 +#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 +#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ +#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */ +#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C +#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ + 
+/* Xeon Phi */ + +#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ + +#endif /* _ASM_X86_INTEL_FAMILY_H */ -- cgit v1.2.3 From 18bb117d1b7690181346e6365c6237b6ceaac4c4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 17:49:25 +0000 Subject: x86/retpoline: Fill RSB on context switch for affected CPUs commit c995efd5a740d9cbafbf58bde4973e8b50b4d761 upstream. On context switch from a shallow call stack to a deeper one, as the CPU does 'ret' up the deeper side it may encounter RSB entries (predictions for where the 'ret' goes to) which were populated in userspace. This is problematic if neither SMEP nor KPTI (the latter of which marks userspace pages as NX for the kernel) are active, as malicious code in userspace may then be executed speculatively. Overwrite the CPU's return prediction stack with calls which are predicted to return to an infinite loop, to "capture" speculation if this happens. This is required both for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI. On Skylake+ the problem is slightly different, and an *underflow* of the RSB may cause errant branch predictions to occur. So there it's not so much overwrite, as *filling* the RSB to attempt to prevent it getting empty. This is only a partial solution for Skylake+ since there are many other conditions which may result in the RSB becoming empty. The full solution on Skylake+ is to use IBRS, which will prevent the problem even when the RSB becomes empty. With IBRS, the RSB-stuffing will not be required on context switch. [ tglx: Added missing vendor check and slighty massaged comments and changelog ] [js] backport to 4.4 -- __switch_to_asm does not exist there, we have to patch the switch_to macros for both x86_32 and x86_64. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515779365-9032-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/switch_to.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0fbc98568018..641f0f2c2982 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -199,6 +199,7 @@ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf11..025ecfaba9c9 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include + struct task_struct; /* one of the stranger aspects of C forward declarations */ __visible struct task_struct *__switch_to(struct 
task_struct *prev, struct task_struct *next); @@ -24,6 +26,23 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +#define __retpoline_fill_return_buffer \ + ALTERNATIVE("jmp 910f", \ + __stringify(__FILL_RETURN_BUFFER(%%ebx, RSB_CLEAR_LOOPS, %%esp)),\ + X86_FEATURE_RSB_CTXSW) \ + "910:\n\t" +#else +#define __retpoline_fill_return_buffer +#endif + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -46,6 +65,7 @@ do { \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ "pushl %[next_ip]\n\t" /* restore EIP */ \ __switch_canary \ + __retpoline_fill_return_buffer \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ "popl %%ebp\n\t" /* restore EBP */ \ @@ -100,6 +120,23 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +#define __retpoline_fill_return_buffer \ + ALTERNATIVE("jmp 910f", \ + __stringify(__FILL_RETURN_BUFFER(%%r12, RSB_CLEAR_LOOPS, %%rsp)),\ + X86_FEATURE_RSB_CTXSW) \ + "910:\n\t" +#else +#define __retpoline_fill_return_buffer +#endif + /* * There is no need to save or restore flags, because flags are always * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL @@ -112,6 +149,7 @@ do { \ "call __switch_to\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ + __retpoline_fill_return_buffer \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ "movq %%rax,%%rdi\n\t" \ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ -- cgit v1.2.3 From ed73df0b7f23c95b3243a0f4bfc40f962e61d349 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 26 Jan 2018 16:23:02 +0000 Subject: vsyscall: Fix permissions for emulate mode with KAISER/PTI The backport of KAISER to 4.4 turned vsyscall emulate mode into native mode. Add a vsyscall_pgprot variable to hold the correct page protections, like Borislav and Hugh did for 3.2 and 3.18. 
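A sketch of the intended use (identifiers mirror the 4.4 code, but the context is paraphrased rather than quoted from the patch):

	/* Sketch: choose vsyscall page protections by mode, then map. */
	unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL;

	void __init map_vsyscall(void)
	{
		unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

		if (vsyscall_mode != NATIVE)
			vsyscall_pgprot = __PAGE_KERNEL_VVAR; /* non-executable */

		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
			     __pgprot(vsyscall_pgprot));
	}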
Cc: Borislav Petkov Cc: Hugh Dickins Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/vsyscall.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 4865e10dbb55..9ee85066f407 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,6 +13,7 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); extern bool vsyscall_enabled(void); +extern unsigned long vsyscall_pgprot; #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) -- cgit v1.2.3 From 80d2b5af21d2a29abfd58d378195a446dbc3ae43 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 5 Nov 2017 16:56:34 +0200 Subject: KVM: x86: Don't re-execute instruction when not passing CR2 value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9b8ae63798cb97e785a667ff27e43fa6220cb734 ] In case of instruction-decode failure or emulation failure, x86_emulate_instruction() will call reexecute_instruction() which will attempt to use the cr2 value passed to x86_emulate_instruction(). However, when x86_emulate_instruction() is called from emulate_instruction(), cr2 is not passed (passed as 0) and therefore it doesn't make sense to execute reexecute_instruction() logic at all. Fixes: 51d8b66199e9 ("KVM: cleanup emulate_instruction") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9d2abb2a41d2..74fda1a453bd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -998,7 +998,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, static inline int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { - return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); + return x86_emulate_instruction(vcpu, 0, + emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); } void kvm_enable_efer_bits(u64); -- cgit v1.2.3 From 69f9dc4b71a70a2bbc23eaa57525df3ecdc881a3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 28 Sep 2017 16:58:26 -0500 Subject: x86/asm: Fix inline asm call constraints for GCC 4.4 commit 520a13c530aeb5f63e011d668c42db1af19ed349 upstream. The kernel test bot (run by Xiaolong Ye) reported that the following commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") is causing double faults in a kernel compiled with GCC 4.4. Linus subsequently diagnosed the crash pattern and the buggy commit and found that the issue is with this code: register unsigned int __asm_call_sp asm("esp"); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) Even on a 64-bit kernel, it's using ESP instead of RSP. 
That causes GCC to produce the following bogus code: ffffffff8147461d: 89 e0 mov %esp,%eax ffffffff8147461f: 4c 89 f7 mov %r14,%rdi ffffffff81474622: 4c 89 fe mov %r15,%rsi ffffffff81474625: ba 20 00 00 00 mov $0x20,%edx ffffffff8147462a: 89 c4 mov %eax,%esp ffffffff8147462c: e8 bf 52 05 00 callq ffffffff814c98f0 Despite the absurdity of it backing up and restoring the stack pointer for no reason, the bug is actually the fact that it's only backing up and restoring the lower 32 bits of the stack pointer. The upper 32 bits are getting cleared out, corrupting the stack pointer. So change the '__asm_call_sp' register variable to be associated with the actual full-size stack pointer. This also requires changing the __ASM_SEL() macro to be based on the actual compiled arch size, rather than the CONFIG value, because CONFIG_X86_64 compiles some files with '-m32' (e.g., realmode and vdso). Otherwise Clang fails to build the kernel because it complains about the use of a 64-bit register (RSP) in a 32-bit file. Reported-and-Bisected-and-Tested-by: kernel test robot Diagnosed-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: LKP Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") Link: http://lkml.kernel.org/r/20170928215826.6sdpmwtkiydiytim@treble Signed-off-by: Ingo Molnar Cc: Matthias Kaehlcke Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index b9c6c7a6f5a6..1c79c8add0eb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -11,10 +11,12 @@ # define __ASM_FORM_COMMA(x) " " #x "," #endif -#ifdef CONFIG_X86_32 +#ifndef __x86_64__ +/* 32 bit */ # define __ASM_SEL(a,b) __ASM_FORM(a) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else +/* 64 bit */ # define __ASM_SEL(a,b) __ASM_FORM(b) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif -- cgit v1.2.3 From 4f3e6ab44b928ef70b77aa374886cb59fd9e4171 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 13 Feb 2018 16:45:20 +0100 Subject: kaiser: fix compile error without vsyscall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tobias noticed a compile error on 4.4.115, and it's the same on 4.9.80: arch/x86/mm/kaiser.c: In function ‘kaiser_init’: arch/x86/mm/kaiser.c:348:8: error: ‘vsyscall_pgprot’ undeclared (first use in this function) It seems like his combination of kernel options doesn't work for KAISER. X86_VSYSCALL_EMULATION is not set on his system, while LEGACY_VSYSCALL is set to NONE (LEGACY_VSYSCALL_NONE=y). He managed to get things compiling again, by moving the 'extern unsigned long vsyscall_pgprot' outside of the preprocessor statement. This works because the optimizer removes that code (vsyscall_enabled() is always false) - and that's how it was done in some older backports. 
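[ note: the move is safe even though vsyscall_pgprot is never defined in such a configuration, because every use is guarded by vsyscall_enabled(), which the !X86_VSYSCALL_EMULATION stub turns into a compile-time constant false; the optimizer then drops the access, so no undefined-symbol reference survives to link time. A minimal sketch of the pattern — use_pgprot() is a hypothetical stand-in for the real kaiser.c user:

	static inline bool vsyscall_enabled(void) { return false; }

	extern unsigned long vsyscall_pgprot;	/* declared, never defined here */

	void __init kaiser_init_sketch(void)
	{
		if (vsyscall_enabled())			/* constant false */
			use_pgprot(vsyscall_pgprot);	/* dead code, eliminated */
	}
  ]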
Reported-by: Tobias Jakobi Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/vsyscall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 9ee85066f407..62210da19a92 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,7 +13,6 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); extern bool vsyscall_enabled(void); -extern unsigned long vsyscall_pgprot; #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) @@ -22,5 +21,6 @@ static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } static inline bool vsyscall_enabled(void) { return false; } #endif +extern unsigned long vsyscall_pgprot; #endif /* _ASM_X86_VSYSCALL_H */ -- cgit v1.2.3 From ff891875c1a079c55cb5def525c2448c4cbed98c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 13 Feb 2018 13:22:08 -0600 Subject: x86/cpu: Change type of x86_cache_size variable to unsigned int commit 24dbc6000f4b9b0ef5a9daecb161f1907733765a upstream. Currently, x86_cache_size is of type int, which makes no sense as we will never have a valid cache size equal to or less than 0. So instead of initializing this variable to -1, it can perfectly be initialized to 0 and used as an unsigned variable instead. Suggested-by: Thomas Gleixner Signed-off-by: Gustavo A. R. Silva Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Addresses-Coverity-ID: 1464429 Link: http://lkml.kernel.org/r/20180213192208.GA26414@embeddedor.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 86bccb4bd4dc..9e77cea2a8ef 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -113,7 +113,7 @@ struct cpuinfo_x86 { char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ - int x86_cache_size; + unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ /* Cache QoS architectural values: */ int x86_cache_max_rmid; /* max index */ -- cgit v1.2.3 From 26c3a6a7f5435712eaa70cb413717838abc49258 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 20 Feb 2018 12:55:05 +0100 Subject: x86/microcode/AMD: Change load_microcode_amd()'s param to bool to fix preemptibility bug commit dac6ca243c4c49a9ca7507d3d66140ebfac8b04b upstream. With CONFIG_DEBUG_PREEMPT enabled, I get: BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1 caller is debug_smp_processor_id CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc2+ #2 Call Trace: dump_stack check_preemption_disabled debug_smp_processor_id save_microcode_in_initrd_amd ? microcode_init save_microcode_in_initrd ... because, well, it says it above, we're using smp_processor_id() in preemptible code. But passing the CPU number is not really needed. It is only used to determine whether we're on the BSP, and, if so, to save the microcode patch for early loading. [ We don't absolutely need to do it on the BSP but we do that customarily there.
] Instead, convert that function parameter to a boolean which denotes whether the patch should be saved or not, thereby avoiding the use of smp_processor_id() in preemptible code. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170528200414.31305-1-bp@alien8.de Signed-off-by: Ingo Molnar [arnd: rebased to 4.9, after running into warning: arch/x86/kernel/cpu/microcode/amd.c:881:30: self-comparison always evaluates to true] Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/microcode_amd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index adfc847a395e..fb163f02ebb1 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -59,7 +59,6 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); #define PATCH_MAX_SIZE PAGE_SIZE extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; -- cgit v1.2.3 From ffe69f2dd11a8ef5f708f116a3bccb80b9d57e7d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 23 Feb 2018 11:41:51 +0100 Subject: x86/retpoline: Remove the esp/rsp thunk commit 1df37383a8aeabb9b418698f0bcdffea01f4b1b2 upstream. It doesn't make sense to have an indirect call thunk with esp/rsp as retpoline code won't work correctly with the stack pointer register. Removing it will help compiler writers to catch errors in case such a thunk call is emitted incorrectly. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Suggested-by: Jeff Law Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Tom Lendacky Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1516658974-27852-1-git-send-email-longman@redhat.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm-prototypes.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index b15aa4083dfd..5a25ada75aeb 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -37,5 +37,4 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) -INDIRECT_THUNK(sp) #endif /* CONFIG_RETPOLINE */ -- cgit v1.2.3 From 3d535a0f55d1ba44b66c88d44e592f12056c188b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 23 Feb 2018 11:41:55 +0100 Subject: x86/nospec: Fix header guards names (cherry picked from commit 7a32fc51ca938e67974cbb9db31e1a43f98345a9) ... to adhere to the _ASM_X86_ naming scheme. No functional change.
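[ note: the scheme mirrors the header's path under arch/x86/include, so asm/nospec-branch.h gets _ASM_X86_NOSPEC_BRANCH_H_. Sketch with a hypothetical header:

	/* arch/x86/include/asm/example.h */
	#ifndef _ASM_X86_EXAMPLE_H_
	#define _ASM_X86_EXAMPLE_H_

	/* declarations */

	#endif /* _ASM_X86_EXAMPLE_H_ */
  ]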
Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-3-bp@alien8.de Signed-off-by: David Woodhouse [cherry-pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 492370b9b35b..82d0d6e5ade8 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NOSPEC_BRANCH_H__ -#define __NOSPEC_BRANCH_H__ +#ifndef _ASM_X86_NOSPEC_BRANCH_H_ +#define _ASM_X86_NOSPEC_BRANCH_H_ #include #include @@ -195,4 +195,4 @@ static inline void vmexit_fill_RSB(void) } #endif /* __ASSEMBLY__ */ -#endif /* __NOSPEC_BRANCH_H__ */ +#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ -- cgit v1.2.3 From f136b56017ad7848449ac8b8aaebc340346acbbd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 23 Feb 2018 11:42:01 +0100 Subject: x86: Implement array_index_mask_nospec (cherry picked from commit babdde2698d482b6c0de1eab4f697cf5856c5859) array_index_nospec() uses a mask to sanitize user-controllable array indexes, i.e. generate a 0 mask if 'index' >= 'size', and a ~0 mask otherwise. The default array_index_mask_nospec() handles the carry-bit from the (index - size) result in software; the x86 array_index_mask_nospec() does the same, but handles the carry-bit in the processor CF flag, without conditional instructions in the control flow. Suggested-by: Linus Torvalds Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727414808.33451.1873237130672785331.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/barrier.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0681d2532527..b5028e3ef909 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -24,6 +24,30 @@ #define wmb() asm volatile("sfence" ::: "memory") #endif +/** + * array_index_mask_nospec() - generate a mask that is ~0UL when the + * bounds check succeeds and 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * Returns: + * 0 - (index < size) + */ +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + unsigned long mask; + + asm ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"r"(size),"r" (index) + :"cc"); + return mask; +} + +/* Override the default implementation from linux/nospec.h.
*/ +#define array_index_mask_nospec array_index_mask_nospec + #ifdef CONFIG_X86_PPRO_FENCE #define dma_rmb() rmb() #else -- cgit v1.2.3 From 64d41d13ed81d55e03c80d241cf353b1aa0bf1c3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 23 Feb 2018 11:42:02 +0100 Subject: x86: Introduce barrier_nospec (cherry picked from commit b3d7ad85b80bbc404635dca80f5b129f6242bc7a) Rename the open coded form of this instruction sequence from rdtsc_ordered() into a generic barrier primitive, barrier_nospec(). One of the mitigations for Spectre variant1 vulnerabilities is to fence speculative execution after successfully validating a bounds check. I.e. force the result of a bounds check to resolve in the instruction pipeline to ensure speculative execution honors that result before potentially operating on out-of-bounds data. No functional changes. Suggested-by: Linus Torvalds Suggested-by: Andi Kleen Suggested-by: Ingo Molnar Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Tom Lendacky Cc: Kees Cook Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727415361.33451.9049453007262764675.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/barrier.h | 4 ++++ arch/x86/include/asm/msr.h | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index b5028e3ef909..814ef83c6720 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -48,6 +48,10 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, /* Override the default implementation from linux/nospec.h. */ #define array_index_mask_nospec array_index_mask_nospec +/* Prevent speculative execution past this barrier. */ +#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ + "lfence", X86_FEATURE_LFENCE_RDTSC) + #ifdef CONFIG_X86_PPRO_FENCE #define dma_rmb() rmb() #else diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 77d8b284e4a7..5a10ac8c131e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -147,8 +147,7 @@ static __always_inline unsigned long long rdtsc_ordered(void) * that some other imaginary CPU is updating continuously with a * time stamp. 
*/ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); + barrier_nospec(); return rdtsc(); } -- cgit v1.2.3 From fd94ae98d2dd6883ed8c7948dcbb48867894045d Mon Sep 17 00:00:00 2001 From: Darren Kenny Date: Fri, 23 Feb 2018 11:42:13 +0100 Subject: x86/speculation: Fix typo IBRS_ATT, which should be IBRS_ALL (cherry picked from commit af189c95a371b59f493dbe0f50c0a09724868881) Fixes: 117cc7a908c83 ("x86/retpoline: Fill return stack buffer on vmexit") Signed-off-by: Darren Kenny Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Cc: Tom Lendacky Cc: Andi Kleen Cc: Borislav Petkov Cc: Masami Hiramatsu Cc: Arjan van de Ven Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180202191220.blvgkgutojecxr3b@starbug-vm.ie.oracle.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 82d0d6e5ade8..66094a0473a8 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -178,7 +178,7 @@ extern char __indirect_thunk_end[]; * On VMEXIT we must ensure that no RSB predictions learned in the guest * can be followed in the host, by overwriting the RSB completely. Both * retpoline and IBRS mitigations for Spectre v2 need this; only on future - * CPUs with IBRS_ATT *might* it be avoided. + * CPUs with IBRS_ALL *might* it be avoided. */ static inline void vmexit_fill_RSB(void) { -- cgit v1.2.3 From 6f0a79ff1b62e78b782c4f80298c586aa8495259 Mon Sep 17 00:00:00 2001 From: Jan Dakinevich Date: Fri, 23 Feb 2018 11:42:17 +0100 Subject: KVM: VMX: clean up declaration of VPID/EPT invalidation types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 63f3ac48133a19110c8a3666028dbd9b1bf3dcb3 upstream - Remove VMX_EPT_EXTENT_INDIVIDUAL_ADDR, since there is no such type of EPT invalidation - Add missing VPID type names Signed-off-by: Jan Dakinevich Tested-by: Ladi Prosek Signed-off-by: Radim Krčmář [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/vmx.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 14c63c7e8337..6b6e16d813b9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -400,10 +400,11 @@ enum vmcs_field { #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3 -#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 #define VMX_EPT_EXTENT_SHIFT 24 @@ -420,8 +421,10 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 -- cgit v1.2.3
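[ note: for context — these extent values go in the register operand of INVVPID, with a 16-byte descriptor in memory. A condensed sketch modeled on KVM's __invvpid() helper in vmx.c (the VMfail/fault checking the real helper performs is omitted here):

	struct invvpid_operand {
		u64 vpid : 16;
		u64 rsvd : 48;
		u64 gva;
	};

	static inline void invvpid_sketch(unsigned long ext, u16 vpid, u64 gva)
	{
		struct invvpid_operand operand = { vpid, 0, gva };

		/* raw opcode: invvpid (%rax), %rcx */
		asm volatile (".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
			      : : "a" (&operand), "c" (ext) : "cc", "memory");
	}

  e.g. invvpid_sketch(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0) flushes all translations tagged with one VPID, while the new SINGLE_NON_GLOBAL type leaves global entries alone; support for each type is advertised by the corresponding _BIT define read from the VMX capability MSRs. ]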
From ea1c4ebe282d6bb6afca4a42bfbfb933c86b264c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Mar 2018 16:17:34 +0100 Subject: bpf, x64: implement retpoline for tail call [ upstream commit a493a87f38cfa48caaa95c9347be2d914c6fdf29 ] Implement a retpoline [0] for the BPF tail call JIT'ing that converts the indirect jump via jmp %rax that is used to make the long jump into another JITed BPF image. Since this is subject to speculative execution, we need to control the transient instruction sequence here as well when CONFIG_RETPOLINE is set, and direct it into a pause + lfence loop. The latter aligns also with what gcc / clang emits (e.g. [1]). JIT dump after patch: # bpftool p d x i 1 0: (18) r2 = map[id:1] 2: (b7) r3 = 0 3: (85) call bpf_tail_call#12 4: (b7) r0 = 2 5: (95) exit With CONFIG_RETPOLINE: # bpftool p d j i 1 [...] 33: cmp %edx,0x24(%rsi) 36: jbe 0x0000000000000072 |* 38: mov 0x24(%rbp),%eax 3e: cmp $0x20,%eax 41: ja 0x0000000000000072 | 43: add $0x1,%eax 46: mov %eax,0x24(%rbp) 4c: mov 0x90(%rsi,%rdx,8),%rax 54: test %rax,%rax 57: je 0x0000000000000072 | 59: mov 0x28(%rax),%rax 5d: add $0x25,%rax 61: callq 0x000000000000006d |+ 66: pause | 68: lfence | 6b: jmp 0x0000000000000066 | 6d: mov %rax,(%rsp) | 71: retq | 72: mov $0x2,%eax [...] * relative fall-through jumps in error case + retpoline for indirect jump Without CONFIG_RETPOLINE: # bpftool p d j i 1 [...] 33: cmp %edx,0x24(%rsi) 36: jbe 0x0000000000000063 |* 38: mov 0x24(%rbp),%eax 3e: cmp $0x20,%eax 41: ja 0x0000000000000063 | 43: add $0x1,%eax 46: mov %eax,0x24(%rbp) 4c: mov 0x90(%rsi,%rdx,8),%rax 54: test %rax,%rax 57: je 0x0000000000000063 | 59: mov 0x28(%rax),%rax 5d: add $0x25,%rax 61: jmpq *%rax |- 63: mov $0x2,%eax [...] * relative fall-through jumps in error case - plain indirect jump as before [0] https://support.google.com/faqs/answer/7625886 [1] https://github.com/gcc-mirror/gcc/commit/a31e654fa107be968b802786d747e962c2fcdb2b Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 66094a0473a8..249f1c769f21 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -195,4 +195,41 @@ static inline void vmexit_fill_RSB(void) } #endif /* __ASSEMBLY__ */ + +/* + * Below is used in the eBPF JIT compiler and emits the byte sequence + * for the following assembly: + * + * With retpolines configured: + * + * callq do_rop + * spec_trap: + * pause + * lfence + * jmp spec_trap + * do_rop: + * mov %rax,(%rsp) + * retq + * + * Without retpolines configured: + * + * jmp *%rax + */ +#ifdef CONFIG_RETPOLINE +# define RETPOLINE_RAX_BPF_JIT_SIZE 17 +# define RETPOLINE_RAX_BPF_JIT() \ + EMIT1_off32(0xE8, 7); /* callq do_rop */ \ + /* spec_trap: */ \ + EMIT2(0xF3, 0x90); /* pause */ \ + EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ + EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ + /* do_rop: */ \ + EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ + EMIT1(0xC3); /* retq */ +#else +# define RETPOLINE_RAX_BPF_JIT_SIZE 2 +# define RETPOLINE_RAX_BPF_JIT() \ + EMIT2(0xFF, 0xE0); /* jmp *%rax */ +#endif + #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ -- cgit v1.2.3
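[ note: for reference, a condensed sketch of the call site in arch/x86/net/bpf_jit_comp.c; the EMIT*() helpers used by the macro append bytes through the local 'prog'/'cnt' variables, and the bounds checks plus target load shown in the dumps above are elided:

	static void emit_bpf_tail_call(u8 **pprog)
	{
		u8 *prog = *pprog;
		int cnt = 0;

		/* ... tail-call limit and prog-array bounds checks ... */
		/* ... rax = prog->bpf_func + prologue size ... */

		/* emit the retpoline or the plain jmp *%rax,
		 * depending on CONFIG_RETPOLINE */
		RETPOLINE_RAX_BPF_JIT();

		*pprog = prog;
	}

  The error-path jump offsets in the real function are sized with RETPOLINE_RAX_BPF_JIT_SIZE so that both variants fall through to the same out label. ]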