diff options
author | Max Krummenacher <max.krummenacher@toradex.com> | 2018-03-13 11:32:58 +0100 |
---|---|---|
committer | Max Krummenacher <max.krummenacher@toradex.com> | 2018-03-13 11:32:58 +0100 |
commit | 6fb9f3c8a4992f67dcb3ce413df2e22e96b2d400 (patch) | |
tree | 6e3071b2f179a62b027669ac2a238383293bf941 /arch/x86/mm | |
parent | a126a5e5dc2fcc5cb36af14c89b440cc8e3bab30 (diff) | |
parent | 8b5ab55d254f36e89b1b53aeac7223d2d102483e (diff) |
Merge tag 'v4.4.121' into toradex_vf_4.4-nextColibri-VF_LXDE-Image_2.8b2.97-20180331
This is the 4.4.121 stable release
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 4 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/kaiser.c | 484 | ||||
-rw-r--r-- | arch/x86/mm/kasan_init_64.c | 17 | ||||
-rw-r--r-- | arch/x86/mm/kmmio.c | 12 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 63 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 5 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 16 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 198 |
11 files changed, 727 insertions, 92 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 65c47fda26fc..61e6cead9c4a 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -9,7 +9,6 @@ CFLAGS_setup_nx.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace obj-$(CONFIG_X86_PAT) += pat_rbtree.o -obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o @@ -33,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3aebbd6c6f5f..151fd33e9043 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (cpu_has_pge) { + if (cpu_has_pge && !kaiser_enabled) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else @@ -753,13 +753,11 @@ void __init zone_sizes_init(void) } DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { -#ifdef CONFIG_SMP .active_mm = &init_mm, .state = 0, -#endif .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; -EXPORT_SYMBOL_GPL(cpu_tlbstate); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec081fe0ce2c..d76ec9348cff 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -395,6 +395,16 @@ void __init cleanup_highmap(void) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); + else if (kaiser_enabled) { + /* + * level2_kernel_pgt is initialized with _PAGE_GLOBAL: + * clear that now. This is not important, so long as + * CR4.PGE remains clear, but it removes an anomaly. + * Physical mapping setup below avoids _PAGE_GLOBAL + * by use of massage_pgprot() inside pfn_pte() etc. + */ + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); + } } } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9c78f3bcd67..53ab3f367472 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -348,11 +348,11 @@ void iounmap(volatile void __iomem *addr) (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) return; + mmiotrace_iounmap(addr); + addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); - mmiotrace_iounmap(addr); - /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c new file mode 100644 index 000000000000..7a72e32e4806 --- /dev/null +++ b/arch/x86/mm/kaiser.c @@ -0,0 +1,484 @@ +#include <linux/bug.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> + +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + +#include <asm/kaiser.h> +#include <asm/tlbflush.h> /* to verify its kaiser declarations */ +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/desc.h> +#include <asm/cmdline.h> +#include <asm/vsyscall.h> + +int kaiser_enabled __read_mostly = 1; +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ + +__visible +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/* + * These can have bit 63 set, so we can not just use a plain "or" + * instruction to get their value or'd into CR3. It would take + * another register. So, we use a memory reference to these instead. + * + * This is also handy because systems that do not support PCIDs + * just end up or'ing a 0 into their CR3, which does no harm. + */ +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); + +/* + * At runtime, the only things we map are some things for CPU + * hotplug, and stacks for new processes. No two CPUs will ever + * be populating the same addresses, so we only need to ensure + * that we protect between two CPUs trying to allocate and + * populate the same page table page. + * + * Only take this lock when doing a set_p[4um]d(), but it is not + * needed for doing a set_pte(). We assume that only the *owner* + * of a given allocation will be doing this for _their_ + * allocation. + * + * This ensures that once a system has been running for a while + * and there have been stacks all over and these page tables + * are fully populated, there will be no further acquisitions of + * this lock. + */ +static DEFINE_SPINLOCK(shadow_table_allocation_lock); + +/* + * Returns -1 on error. + */ +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(vaddr); + /* + * We made all the kernel PGDs present in kaiser_init(). + * We expect them to stay that way. + */ + BUG_ON(pgd_none(*pgd)); + /* + * PGDs are either 512GB or 128TB on all x86_64 + * configurations. We don't handle these. + */ + BUG_ON(pgd_large(*pgd)); + + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + WARN_ON_ONCE(1); + return -1; + } + + if (pud_large(*pud)) + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); + + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + WARN_ON_ONCE(1); + return -1; + } + + if (pmd_large(*pmd)) + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); + + pte = pte_offset_kernel(pmd, vaddr); + if (pte_none(*pte)) { + WARN_ON_ONCE(1); + return -1; + } + + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); +} + +/* + * This is a relatively normal page table walk, except that it + * also tries to allocate page tables pages along the way. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static pte_t *kaiser_pagetable_walk(unsigned long address, bool user) +{ + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + unsigned long prot = _KERNPG_TABLE; + + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); + return NULL; + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + if (user) { + /* + * The vsyscall page is the only page that will have + * _PAGE_USER set. Catch everything else. + */ + BUG_ON(address != VSYSCALL_ADDR); + + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); + prot = _PAGE_TABLE; + } + + pud = pud_offset(pgd, address); + /* The shadow page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pud_none(*pud)) { + set_pud(pud, __pud(prot | __pa(new_pmd_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pmd_page), NR_KAISERTABLE); + } else + free_page(new_pmd_page); + spin_unlock(&shadow_table_allocation_lock); + } + + pmd = pmd_offset(pud, address); + /* The shadow page tables do not use large mappings: */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(prot | __pa(new_pte_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pte_page), NR_KAISERTABLE); + } else + free_page(new_pte_page); + spin_unlock(&shadow_table_allocation_lock); + } + + return pte_offset_kernel(pmd, address); +} + +static int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) +{ + int ret = 0; + pte_t *pte; + unsigned long start_addr = (unsigned long )__start_addr; + unsigned long address = start_addr & PAGE_MASK; + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + + /* + * It is convenient for callers to pass in __PAGE_KERNEL etc, + * and there is no actual harm from setting _PAGE_GLOBAL, so + * long as CR4.PGE is not set. But it is nonetheless troubling + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" + * requires that not to be #defined to 0): so mask it off here. + */ + flags &= ~_PAGE_GLOBAL; + if (!(__supported_pte_mask & _PAGE_NX)) + flags &= ~_PAGE_NX; + + for (; address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { + ret = -EIO; + break; + } + pte = kaiser_pagetable_walk(address, flags & _PAGE_USER); + if (!pte) { + ret = -ENOMEM; + break; + } + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { + pte_t tmp; + set_pte(&tmp, __pte(flags | target_address)); + WARN_ON_ONCE(!pte_same(*pte, tmp)); + } + } + return ret; +} + +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) +{ + unsigned long size = end - start; + + return kaiser_add_user_map(start, size, flags); +} + +/* + * Ensure that the top level of the (shadow) page tables are + * entirely populated. This ensures that all processes that get + * forked have the same entries. This way, we do not have to + * ever go set up new entries in older processes. + * + * Note: we never free these, so there are no updates to them + * after this. + */ +static void __init kaiser_init_all_pgds(void) +{ + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + pgd_t new_pgd; + pud_t *pud = pud_alloc_one(&init_mm, + PAGE_OFFSET + i * PGDIR_SIZE); + if (!pud) { + WARN_ON(1); + break; + } + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); + /* + * Make sure not to stomp on some other pgd entry. + */ + if (!pgd_none(pgd[i])) { + WARN_ON(1); + continue; + } + set_pgd(pgd + i, new_pgd); + } +} + +#define kaiser_add_user_map_early(start, size, flags) do { \ + int __ret = kaiser_add_user_map(start, size, flags); \ + WARN_ON(__ret); \ +} while (0) + +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ + WARN_ON(__ret); \ +} while (0) + +void __init kaiser_check_boottime_disable(void) +{ + bool enable = true; + char arg[5]; + int ret; + + if (boot_cpu_has(X86_FEATURE_XENPV)) + goto silent_disable; + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (!strncmp(arg, "on", 2)) + goto enable; + + if (!strncmp(arg, "off", 3)) + goto disable; + + if (!strncmp(arg, "auto", 4)) + goto skip; + } + + if (cmdline_find_option_bool(boot_command_line, "nopti")) + goto disable; + +skip: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + goto disable; + +enable: + if (enable) + setup_force_cpu_cap(X86_FEATURE_KAISER); + + return; + +disable: + pr_info("disabled\n"); + +silent_disable: + kaiser_enabled = 0; + setup_clear_cpu_cap(X86_FEATURE_KAISER); +} + +/* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we + * will have most of the kernel up by then and should be able to + * get a clean warning out of it. If we BUG_ON() here, we run + * the risk of being before we have good console output. + */ +void __init kaiser_init(void) +{ + int cpu; + + if (!kaiser_enabled) + return; + + kaiser_init_all_pgds(); + + /* + * Note that this sets _PAGE_USER and it needs to happen when the + * pagetable hierarchy gets created, i.e., early. Otherwise + * kaiser_pagetable_walk() will encounter initialized PTEs in the + * hierarchy and not set the proper permissions, leading to the + * pagefaults with page-protection violations when trying to read the + * vsyscall page. For example. + */ + if (vsyscall_enabled()) + kaiser_add_user_map_early((void *)VSYSCALL_ADDR, + PAGE_SIZE, + vsyscall_pgprot); + + for_each_possible_cpu(cpu) { + void *percpu_vaddr = __per_cpu_user_mapped_start + + per_cpu_offset(cpu); + unsigned long percpu_sz = __per_cpu_user_mapped_end - + __per_cpu_user_mapped_start; + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, + __PAGE_KERNEL); + } + + /* + * Map the entry/exit text section, which is needed at + * switches from user to and from kernel. + */ + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, + __PAGE_KERNEL_RX); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + kaiser_add_user_map_ptrs_early(__irqentry_text_start, + __irqentry_text_end, + __PAGE_KERNEL_RX); +#endif + kaiser_add_user_map_early((void *)idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); +#ifdef CONFIG_TRACING + kaiser_add_user_map_early(&trace_idt_descr, + sizeof(trace_idt_descr), + __PAGE_KERNEL); + kaiser_add_user_map_early(&trace_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); +#endif + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), + __PAGE_KERNEL); + kaiser_add_user_map_early(&debug_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); + + pr_info("enabled\n"); +} + +/* Add a mapping to the shadow mapping, and synchronize the mappings */ +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + if (!kaiser_enabled) + return 0; + return kaiser_add_user_map((const void *)addr, size, flags); +} + +void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ + extern void unmap_pud_range_nofree(pgd_t *pgd, + unsigned long start, unsigned long end); + unsigned long end = start + size; + unsigned long addr, next; + pgd_t *pgd; + + if (!kaiser_enabled) + return; + pgd = native_get_shadow_pgd(pgd_offset_k(start)); + for (addr = start; addr < end; pgd++, addr = next) { + next = pgd_addr_end(addr, end); + unmap_pud_range_nofree(pgd, addr, next); + } +} + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(pgd_t *pgdp) +{ + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); +} + +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + if (!kaiser_enabled) + return pgd; + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. + */ + if (pgd.pgd & _PAGE_USER) { + if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + if (__supported_pte_mask & _PAGE_NX) + pgd.pgd |= _PAGE_NX; + } + } else if (!pgd.pgd) { + /* + * pgd_clear() cannot check _PAGE_USER, and is even used to + * clear corrupted pgd entries: so just rely on cases like + * kexec and EFI never to be using pgd_clear(). + */ + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && + is_userspace_pgd(pgdp)) + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + return pgd; +} + +void kaiser_setup_pcid(void) +{ + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; + + if (this_cpu_has(X86_FEATURE_PCID)) + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; + /* + * These variables are used by the entry/exit + * code to change PCID and pgd and TLB flushing. + */ + this_cpu_write(x86_cr3_pcid_user, user_cr3); +} + +/* + * Make a note that this cpu will need to flush USER tlb on return to user. + * If cpu does not have PCID, then the NOFLUSH bit will never have been set. + */ +void kaiser_flush_tlb_on_return_to_user(void) +{ + if (this_cpu_has(X86_FEATURE_PCID)) + this_cpu_write(x86_cr3_pcid_user, + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); +} +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 4e5ac46adc9d..fdfa25c83119 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -121,11 +121,22 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - memset(kasan_zero_page, 0, PAGE_SIZE); - load_cr3(init_level4_pgt); __flush_tlb_all(); - init_task.kasan_depth = 0; + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. + */ + memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); + + init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index ddb2244b06a1..76604c8a2a48 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -434,17 +434,18 @@ int register_kmmio_probe(struct kmmio_probe *p) unsigned long flags; int ret = 0; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); unsigned int l; pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); - if (get_kmmio_probe(p->addr)) { + if (get_kmmio_probe(addr)) { ret = -EEXIST; goto out; } - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) { ret = -EINVAL; goto out; @@ -453,7 +454,7 @@ int register_kmmio_probe(struct kmmio_probe *p) kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { - if (add_kmmio_fault_page(p->addr + size)) + if (add_kmmio_fault_page(addr + size)) pr_err("Unable to set page fault.\n"); size += page_level_size(l); } @@ -527,19 +528,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; unsigned int l; pte_t *pte; - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { - release_kmmio_fault_page(p->addr + size, &release_list); + release_kmmio_fault_page(addr + size, &release_list); size += page_level_size(l); } list_del_rcu(&p->list); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index b599a780a5a9..79377e2a7bcd 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 +#define CPA_FREE_PAGETABLES 8 #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, return 0; } -static bool try_to_free_pte_page(pte_t *pte) +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) { int i; + if (!(cpa->flags & CPA_FREE_PAGETABLES)) + return false; + for (i = 0; i < PTRS_PER_PTE; i++) if (!pte_none(pte[i])) return false; @@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte) return true; } -static bool try_to_free_pmd_page(pmd_t *pmd) +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) { int i; + if (!(cpa->flags & CPA_FREE_PAGETABLES)) + return false; + for (i = 0; i < PTRS_PER_PMD; i++) if (!pmd_none(pmd[i])) return false; @@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud) return true; } -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, + unsigned long start, + unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); @@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) pte++; } - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { pmd_clear(pmd); return true; } return false; } -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { - if (unmap_pte_range(pmd, start, end)) - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + if (unmap_pte_range(cpa, pmd, start, end)) + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) pud_clear(pud); } -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, + unsigned long start, unsigned long end) { pmd_t *pmd = pmd_offset(pud, start); @@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); - __unmap_pmd_range(pud, pmd, start, pre_end); + __unmap_pmd_range(cpa, pud, pmd, start, pre_end); start = pre_end; pmd++; @@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) if (pmd_large(*pmd)) pmd_clear(pmd); else - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + __unmap_pmd_range(cpa, pud, pmd, + start, start + PMD_SIZE); start += PMD_SIZE; pmd++; @@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) * 4K leftovers? */ if (start < end) - return __unmap_pmd_range(pud, pmd, start, end); + return __unmap_pmd_range(cpa, pud, pmd, start, end); /* * Try again to free the PMD page if haven't succeeded above. */ if (!pud_none(*pud)) - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) pud_clear(pud); } -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, + unsigned long start, + unsigned long end) { pud_t *pud = pud_offset(pgd, start); @@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); - unmap_pmd_range(pud, start, pre_end); + unmap_pmd_range(cpa, pud, start, pre_end); start = pre_end; pud++; @@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) if (pud_large(*pud)) pud_clear(pud); else - unmap_pmd_range(pud, start, start + PUD_SIZE); + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); start += PUD_SIZE; pud++; @@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) * 2M leftovers? */ if (start < end) - unmap_pmd_range(pud, start, end); + unmap_pmd_range(cpa, pud, start, end); /* * No need to try to free the PUD page because we'll free it in @@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) */ } +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +{ + struct cpa_data cpa = { + .flags = CPA_FREE_PAGETABLES, + }; + + __unmap_pud_range(&cpa, pgd, start, end); +} + +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) +{ + struct cpa_data cpa = { + .flags = 0, + }; + + __unmap_pud_range(&cpa, pgd, start, end); +} + static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) { pgd_t *pgd_entry = root + pgd_index(addr); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3f1bb4f93a5a..3146b1da6d72 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -750,11 +750,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) return 1; while (cursor < to) { - if (!devmem_is_allowed(pfn)) { - pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", - current->comm, from, to - 1); + if (!devmem_is_allowed(pfn)) return 0; - } cursor += PAGE_SIZE; pfn++; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index fb0a9dd1d6e4..dbc27a2b4ad5 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -6,7 +6,7 @@ #include <asm/fixmap.h> #include <asm/mtrr.h> -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM @@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } #else + +/* + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER kaiser_enabled + static inline pgd_t *_pgd_alloc(void) { - return (pgd_t *)__get_free_page(PGALLOC_GFP); + /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */ + return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT, + PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5a760fd66bec..7cad01af6dcd 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -6,16 +6,17 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/cpu.h> +#include <linux/debugfs.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/cache.h> #include <asm/apic.h> #include <asm/uv/uv.h> -#include <linux/debugfs.h> +#include <asm/kaiser.h> /* - * Smarter SMP flushing macros. + * TLB flushing, formerly SMP-only * c/o Linus Torvalds. * * These mean you can really definitely utterly forget about @@ -34,6 +35,36 @@ struct flush_tlb_info { unsigned long flush_end; }; +static void load_new_mm_cr3(pgd_t *pgdir) +{ + unsigned long new_mm_cr3 = __pa(pgdir); + + if (kaiser_enabled) { + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entries for the PCID out when we change tasks. + * Flush KERN below, flush USER when returning to userspace in + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. + * + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could + * do it here, but can only be used if X86_FEATURE_INVPCID is + * available - and many machines support pcid without invpcid. + * + * If X86_CR3_PCID_KERN_FLUSH actually added something, then it + * would be needed in the write_cr3() below - if PCIDs enabled. + */ + BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH); + kaiser_flush_tlb_on_return_to_user(); + } + + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. @@ -45,7 +76,7 @@ void leave_mm(int cpu) BUG(); if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); - load_cr3(swapper_pg_dir); + load_new_mm_cr3(swapper_pg_dir); /* * This gets called in the idle path where RCU * functions differently. Tracing normally @@ -57,6 +88,109 @@ void leave_mm(int cpu) } EXPORT_SYMBOL_GPL(leave_mm); +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + + if (likely(prev != next)) { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ + load_new_mm_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + + /* Stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* Load per-mm CR4 state */ + load_mm_cr4(next); + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT, if the LDT is different. + * + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never set context.ldt to NULL while the mm still + * exists. That means that next->context.ldt != + * prev->context.ldt, because mms never share an LDT. + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); +#endif + } else { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. + */ + load_new_mm_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); + } + } +} + /* * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] @@ -104,7 +238,7 @@ static void flush_tlb_func(void *info) inc_irq_stat(irq_tlb_count); - if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); @@ -158,23 +292,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, smp_call_function_many(cpumask, flush_tlb_func, &info, 1); } -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - - preempt_disable(); - - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - - /* This is an implicit full barrier that synchronizes with switch_mm. */ - local_flush_tlb(); - - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); - preempt_enable(); -} - /* * See Documentation/x86/tlb.txt for details. We choose 33 * because it is large enough to cover the vast majority (at @@ -195,6 +312,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); + + if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) + base_pages_to_flush = (end - start) >> PAGE_SHIFT; + if (base_pages_to_flush > tlb_single_page_flush_ceiling) + base_pages_to_flush = TLB_FLUSH_ALL; + if (current->active_mm != mm) { /* Synchronize with switch_mm. */ smp_mb(); @@ -211,15 +334,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, goto out; } - if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) - base_pages_to_flush = (end - start) >> PAGE_SHIFT; - /* * Both branches below are implicit full barriers (MOV to CR or * INVLPG) that synchronize with switch_mm. */ - if (base_pages_to_flush > tlb_single_page_flush_ceiling) { - base_pages_to_flush = TLB_FLUSH_ALL; + if (base_pages_to_flush == TLB_FLUSH_ALL) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { @@ -240,33 +359,6 @@ out: preempt_enable(); } -void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) -{ - struct mm_struct *mm = vma->vm_mm; - - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) { - /* - * Implicit full barrier (INVLPG) that synchronizes - * with switch_mm. - */ - __flush_tlb_one(start); - } else { - leave_mm(smp_processor_id()); - - /* Synchronize with switch_mm. */ - smp_mb(); - } - } - - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); - - preempt_enable(); -} - static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); |