diff options
author | Ingo Molnar <mingo@kernel.org> | 2018-12-17 18:48:25 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2018-12-17 18:48:25 +0100 |
commit | 02117e42db7470e59910088b2b0ee42d581d2651 (patch) | |
tree | 7734414b46fb72ffc708ac924f3b6b69de1f0d4c /arch/x86/mm | |
parent | ba6f508d0ec4adb09f0a939af6d5e19cdfa8667d (diff) | |
parent | 721066dfd4d5c0fee5772c777d6930d0f423b4eb (diff) |
Merge branch 'x86/urgent' into x86/mm, to pick up dependent fix
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 15 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 24 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 13 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 115 |
4 files changed, 119 insertions, 48 deletions
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index fc37bbd23eb8..abcb8d00b014 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -55,10 +55,10 @@ struct addr_marker { enum address_markers_idx { USER_SPACE_NR = 0, KERNEL_SPACE_NR, - LOW_KERNEL_NR, -#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) +#ifdef CONFIG_MODIFY_LDT_SYSCALL LDT_NR, #endif + LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, #ifdef CONFIG_KASAN @@ -66,9 +66,6 @@ enum address_markers_idx { KASAN_SHADOW_END_NR, #endif CPU_ENTRY_AREA_NR, -#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) - LDT_NR, -#endif #ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, #endif @@ -512,11 +509,11 @@ static inline bool is_hypervisor_range(int idx) { #ifdef CONFIG_X86_64 /* - * ffff800000000000 - ffff87ffffffffff is reserved for - * the hypervisor. + * A hole in the beginning of kernel address space reserved + * for a hypervisor. */ - return (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); + return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) && + (idx < pgd_index(GUARD_HOLE_END_ADDR)); #else return false; #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index db7a10082238..a1bcde35db4c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,20 +285,16 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static bool __cpa_flush_range(unsigned long start, int numpages, int cache) +static bool __inv_flush_all(int cache) { BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - WARN_ON(PAGE_ALIGN(start) != start); - if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return true; } - flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - - return !cache; + return false; } static void cpa_flush_range(unsigned long start, int numpages, int cache) @@ -306,7 +302,14 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) unsigned int i, level; unsigned long addr; - if (__cpa_flush_range(start, numpages, cache)) + WARN_ON(PAGE_ALIGN(start) != start); + + if (__inv_flush_all(cache)) + return; + + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); + + if (!cache) return; /* @@ -332,7 +335,12 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, { unsigned int i, level; - if (__cpa_flush_range(baddr, numpages, cache)) + if (__inv_flush_all(cache)) + return; + + flush_tlb_all(); + + if (!cache) return; /* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 08013524fba1..4fe956a63b25 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -519,8 +519,13 @@ static u64 sanitize_phys(u64 address) * for a "decoy" virtual address (bit 63 clear) passed to * set_memory_X(). __pa() on a "decoy" address results in a * physical address with bit 63 set. + * + * Decoy addresses are not present for 32-bit builds, see + * set_mce_nospec(). */ - return address & __PHYSICAL_MASK; + if (IS_ENABLED(CONFIG_X86_64)) + return address & __PHYSICAL_MASK; + return address; } /* @@ -546,7 +551,11 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, start = sanitize_phys(start); end = sanitize_phys(end); - BUG_ON(start >= end); /* end is exclusive */ + if (start >= end) { + WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, + start, end - 1, cattr_name(req_type)); + return -EINVAL; + } if (!pat_enabled()) { /* This is identical to page table setting without PAT */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index bddd6b3cee1d..03b6b4c2238d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,7 +7,6 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> -#include <linux/ptrace.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -31,6 +30,12 @@ */ /* + * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_ibpb. + */ +#define LAST_USER_MM_IBPB 0x1UL + +/* * We get here when we do something requiring a TLB invalidation * but could not go invalidate all of the contexts. We do the * necessary invalidation by clearing out the 'ctx_id' which @@ -181,17 +186,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) } } -static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id) +static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +{ + unsigned long next_tif = task_thread_info(next)->flags; + unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + + return (unsigned long)next->mm | ibpb; +} + +static void cond_ibpb(struct task_struct *next) { + if (!next || !next->mm) + return; + /* - * Check if the current (previous) task has access to the memory - * of the @tsk (next) task. If access is denied, make sure to - * issue a IBPB to stop user->user Spectre-v2 attacks. - * - * Note: __ptrace_may_access() returns 0 or -ERRNO. + * Both, the conditional and the always IBPB mode use the mm + * pointer to avoid the IBPB when switching between tasks of the + * same process. Using the mm pointer instead of mm->context.ctx_id + * opens a hypothetical hole vs. mm_struct reuse, which is more or + * less impossible to control by an attacker. Aside of that it + * would only affect the first schedule so the theoretically + * exposed data is not really interesting. */ - return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id && - ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB)); + if (static_branch_likely(&switch_mm_cond_ibpb)) { + unsigned long prev_mm, next_mm; + + /* + * This is a bit more complex than the always mode because + * it has to handle two cases: + * + * 1) Switch from a user space task (potential attacker) + * which has TIF_SPEC_IB set to a user space task + * (potential victim) which has TIF_SPEC_IB not set. + * + * 2) Switch from a user space task (potential attacker) + * which has TIF_SPEC_IB not set to a user space task + * (potential victim) which has TIF_SPEC_IB set. + * + * This could be done by unconditionally issuing IBPB when + * a task which has TIF_SPEC_IB set is either scheduled in + * or out. Though that results in two flushes when: + * + * - the same user space task is scheduled out and later + * scheduled in again and only a kernel thread ran in + * between. + * + * - a user space task belonging to the same process is + * scheduled in after a kernel thread ran in between + * + * - a user space task belonging to the same process is + * scheduled in immediately. + * + * Optimize this with reasonably small overhead for the + * above cases. Mangle the TIF_SPEC_IB bit into the mm + * pointer of the incoming task which is stored in + * cpu_tlbstate.last_user_mm_ibpb for comparison. + */ + next_mm = mm_mangle_tif_spec_ib(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); + + /* + * Issue IBPB only if the mm's are different and one or + * both have the IBPB bit set. + */ + if (next_mm != prev_mm && + (next_mm | prev_mm) & LAST_USER_MM_IBPB) + indirect_branch_prediction_barrier(); + + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); + } + + if (static_branch_unlikely(&switch_mm_always_ibpb)) { + /* + * Only flush when switching to a user space task with a + * different context than the user space task which ran + * last on this CPU. + */ + if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + indirect_branch_prediction_barrier(); + this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); + } + } } void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, @@ -292,22 +367,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, new_asid = prev_asid; need_flush = true; } else { - u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); - /* * Avoid user/user BTB poisoning by flushing the branch * predictor when switching between processes. This stops * one process from doing Spectre-v2 attacks on another. - * - * As an optimization, flush indirect branches only when - * switching into a processes that can't be ptrace by the - * current one (as in such case, attacker has much more - * convenient way how to tamper with the next process than - * branch buffer poisoning). */ - if (static_cpu_has(X86_FEATURE_USE_IBPB) && - ibpb_needed(tsk, last_ctx_id)) - indirect_branch_prediction_barrier(); + cond_ibpb(tsk); if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* @@ -365,14 +430,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } - /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. - */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - /* Make sure we write CR3 before loaded_mm. */ barrier(); @@ -441,7 +498,7 @@ void initialize_tlbstate_and_flush(void) write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); |