diff options
-rw-r--r-- | arch/x86/kernel/alternative.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets_32.c | 14 | ||||
-rw-r--r-- | arch/x86/kernel/entry_32.S | 2 | ||||
-rw-r--r-- | arch/x86/kernel/paravirt_32.c | 224 | ||||
-rw-r--r-- | arch/x86/kernel/vmi_32.c | 201 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 22 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 233 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 145 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.c | 52 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.h | 5 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 14 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 6 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 10 | ||||
-rw-r--r-- | drivers/char/hvc_lguest.c | 2 | ||||
-rw-r--r-- | drivers/lguest/core.c | 6 | ||||
-rw-r--r-- | drivers/lguest/lguest.c | 152 | ||||
-rw-r--r-- | drivers/lguest/lguest_bus.c | 2 | ||||
-rw-r--r-- | include/asm-x86/paravirt.h | 487 | ||||
-rw-r--r-- | include/asm-x86/pgtable-3level-defs.h | 2 | ||||
-rw-r--r-- | include/xen/interface/vcpu.h | 5 | ||||
-rw-r--r-- | mm/Kconfig | 1 |
21 files changed, 960 insertions, 629 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 11b03d3c6fda..42421437ded3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -369,8 +369,8 @@ void apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insnbuf, p->instr, p->len); - used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf, - (unsigned long)p->instr, p->len); + used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, + (unsigned long)p->instr, p->len); BUG_ON(used > p->len); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 8029742c0fc1..f1b7cdda82b3 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -116,12 +116,14 @@ void foo(void) #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); - OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); - OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); - OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); - OFFSET(PARAVIRT_iret, paravirt_ops, iret); - OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif #ifdef CONFIG_XEN diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 8099fea0a72f..dc7f938e5015 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -437,7 +437,7 @@ ldt_ss: * is still available to implement the setting of the high * 16-bits in the INTERRUPT_RETURN paravirt-op. */ - cmpl $0, paravirt_ops+PARAVIRT_enabled + cmpl $0, pv_info+PARAVIRT_enabled jne restore_nocheck #endif diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c index 739cfb207dd7..6a80d67c2121 100644 --- a/arch/x86/kernel/paravirt_32.c +++ b/arch/x86/kernel/paravirt_32.c @@ -42,32 +42,33 @@ void _paravirt_nop(void) static void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - paravirt_ops.name); + pv_info.name); } char *memory_setup(void) { - return paravirt_ops.memory_setup(); + return pv_init_ops.memory_setup(); } /* Simple instruction patching code. */ -#define DEF_NATIVE(name, code) \ - extern const char start_##name[], end_##name[]; \ - asm("start_" #name ": " code "; end_" #name ":") - -DEF_NATIVE(irq_disable, "cli"); -DEF_NATIVE(irq_enable, "sti"); -DEF_NATIVE(restore_fl, "push %eax; popf"); -DEF_NATIVE(save_fl, "pushf; pop %eax"); -DEF_NATIVE(iret, "iret"); -DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); -DEF_NATIVE(read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(clts, "clts"); -DEF_NATIVE(read_tsc, "rdtsc"); - -DEF_NATIVE(ud2a, "ud2a"); +#define DEF_NATIVE(ops, name, code) \ + extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); +DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + +/* Undefined instruction for dealing with missing ops pointers. */ +static const unsigned char ud2a[] = { 0x0f, 0x0b }; static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned ret; switch(type) { -#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site - SITE(irq_disable); - SITE(irq_enable); - SITE(restore_fl); - SITE(save_fl); - SITE(iret); - SITE(irq_enable_sysexit); - SITE(read_cr2); - SITE(read_cr3); - SITE(write_cr3); - SITE(clts); - SITE(read_tsc); +#define SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + + SITE(pv_irq_ops, irq_disable); + SITE(pv_irq_ops, irq_enable); + SITE(pv_irq_ops, restore_fl); + SITE(pv_irq_ops, save_fl); + SITE(pv_cpu_ops, iret); + SITE(pv_cpu_ops, irq_enable_sysexit); + SITE(pv_mmu_ops, read_cr2); + SITE(pv_mmu_ops, read_cr3); + SITE(pv_mmu_ops, write_cr3); + SITE(pv_cpu_ops, clts); + SITE(pv_cpu_ops, read_tsc); #undef SITE patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); break; - case PARAVIRT_PATCH(make_pgd): - case PARAVIRT_PATCH(make_pte): - case PARAVIRT_PATCH(pgd_val): - case PARAVIRT_PATCH(pte_val): -#ifdef CONFIG_X86_PAE - case PARAVIRT_PATCH(make_pmd): - case PARAVIRT_PATCH(pmd_val): -#endif - /* These functions end up returning exactly what - they're passed, in the same registers. */ - ret = paravirt_patch_nop(); - break; - default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; @@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf, return 5; } -unsigned paravirt_patch_jmp(const void *target, void *insnbuf, +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, unsigned long addr, unsigned len) { struct branch *b = insnbuf; @@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf, return 5; } +/* Neat trick to map patch type back to the call within the + * corresponding structure. */ +static void *get_call_destination(u8 type) +{ + struct paravirt_patch_template tmpl = { + .pv_init_ops = pv_init_ops, + .pv_time_ops = pv_time_ops, + .pv_cpu_ops = pv_cpu_ops, + .pv_irq_ops = pv_irq_ops, + .pv_apic_ops = pv_apic_ops, + .pv_mmu_ops = pv_mmu_ops, + }; + return *((void **)&tmpl + type); +} + unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, unsigned long addr, unsigned len) { - void *opfunc = *((void **)¶virt_ops + type); + void *opfunc = get_call_destination(type); unsigned ret; if (opfunc == NULL) /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); + ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); else if (opfunc == paravirt_nop) /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); - else if (type == PARAVIRT_PATCH(iret) || - type == PARAVIRT_PATCH(irq_enable_sysexit)) + else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); + ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else /* Otherwise call the function; assume target could clobber any caller-save reg */ @@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, void init_IRQ(void) { - paravirt_ops.init_IRQ(); + pv_irq_ops.init_IRQ(); } static void native_flush_tlb(void) @@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void); static int __init print_banner(void) { - paravirt_ops.banner(); + pv_init_ops.banner(); return 0; } core_initcall(print_banner); @@ -273,47 +281,96 @@ int paravirt_disable_iospace(void) return ret; } -struct paravirt_ops paravirt_ops = { +static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + +static inline void enter_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(preemptible()); + + x86_write_percpu(paravirt_lazy_mode, mode); +} + +void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); + BUG_ON(preemptible()); + + x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); +} + +void paravirt_enter_lazy_mmu(void) +{ + enter_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_leave_lazy_mmu(void) +{ + paravirt_leave_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_enter_lazy_cpu(void) +{ + enter_lazy(PARAVIRT_LAZY_CPU); +} + +void paravirt_leave_lazy_cpu(void) +{ + paravirt_leave_lazy(PARAVIRT_LAZY_CPU); +} + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void) +{ + return x86_read_percpu(paravirt_lazy_mode); +} + +struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ +}; - .patch = native_patch, +struct pv_init_ops pv_init_ops = { + .patch = native_patch, .banner = default_banner, .arch_setup = paravirt_nop, .memory_setup = machine_specific_memory_setup, +}; + +struct pv_time_ops pv_time_ops = { + .time_init = hpet_time_init, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, - .time_init = hpet_time_init, + .sched_clock = native_sched_clock, + .get_cpu_khz = native_calculate_cpu_khz, +}; + +struct pv_irq_ops pv_irq_ops = { .init_IRQ = native_init_IRQ, + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + .safe_halt = native_safe_halt, + .halt = native_halt, +}; +struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, .clts = native_clts, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, - .read_cr2 = native_read_cr2, - .write_cr2 = native_write_cr2, - .read_cr3 = native_read_cr3, - .write_cr3 = native_write_cr3, .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, - .save_fl = native_save_fl, - .restore_fl = native_restore_fl, - .irq_disable = native_irq_disable, - .irq_enable = native_irq_enable, - .safe_halt = native_safe_halt, - .halt = native_halt, .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .sched_clock = native_sched_clock, - .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, @@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = { .write_idt_entry = write_dt_entry, .load_esp0 = native_load_esp0, + .irq_enable_sysexit = native_irq_enable_sysexit, + .iret = native_iret, + .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, + .lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + }, +}; + +struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = native_apic_write, .apic_write_atomic = native_apic_write_atomic, @@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = { .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif - .set_lazy_mode = paravirt_nop, +}; +struct pv_mmu_ops pv_mmu_ops = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = native_write_cr3, + .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, @@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = { .make_pte = native_make_pte, .make_pgd = native_make_pgd, - .irq_enable_sysexit = native_irq_enable_sysexit, - .iret = native_iret, - .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, .activate_mm = paravirt_nop, + + .lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + }, }; -EXPORT_SYMBOL(paravirt_ops); +EXPORT_SYMBOL_GPL(pv_time_ops); +EXPORT_SYMBOL_GPL(pv_cpu_ops); +EXPORT_SYMBOL_GPL(pv_mmu_ops); +EXPORT_SYMBOL_GPL(pv_apic_ops); +EXPORT_SYMBOL_GPL(pv_info); +EXPORT_SYMBOL (pv_irq_ops); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 18673e0f193b..f02bad68abaa 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned long eip, unsigned len) { switch (type) { - case PARAVIRT_PATCH(irq_disable): + case PARAVIRT_PATCH(pv_irq_ops.irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, insns, eip); - case PARAVIRT_PATCH(irq_enable): + case PARAVIRT_PATCH(pv_irq_ops.irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, insns, eip); - case PARAVIRT_PATCH(restore_fl): + case PARAVIRT_PATCH(pv_irq_ops.restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, insns, eip); - case PARAVIRT_PATCH(save_fl): + case PARAVIRT_PATCH(pv_irq_ops.save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, insns, eip); - case PARAVIRT_PATCH(iret): + case PARAVIRT_PATCH(pv_cpu_ops.iret): return patch_internal(VMI_CALL_IRET, len, insns, eip); - case PARAVIRT_PATCH(irq_enable_sysexit): + case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); default: break; @@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) +static void vmi_enter_lazy_cpu(void) { - static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); - - if (!vmi_ops.set_lazy_mode) - return; + paravirt_enter_lazy_cpu(); + vmi_ops.set_lazy_mode(2); +} - /* Modes should never nest or overlap */ - BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || - mode == PARAVIRT_LAZY_FLUSH)); +static void vmi_enter_lazy_mmu(void) +{ + paravirt_enter_lazy_mmu(); + vmi_ops.set_lazy_mode(1); +} - if (mode == PARAVIRT_LAZY_FLUSH) { - vmi_ops.set_lazy_mode(0); - vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode)); - } else { - vmi_ops.set_lazy_mode(mode); - __get_cpu_var(lazy_mode) = mode; - } +static void vmi_leave_lazy(void) +{ + paravirt_leave_lazy(paravirt_get_lazy_mode()); + vmi_ops.set_lazy_mode(0); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -690,9 +688,9 @@ do { \ reloc = call_vrom_long_func(vmi_rom, get_reloc, \ VMI_CALL_##vmicall); \ if (rel->type == VMI_RELOCATION_CALL_REL) \ - paravirt_ops.opname = (void *)rel->eip; \ + opname = (void *)rel->eip; \ else if (rel->type == VMI_RELOCATION_NOP) \ - paravirt_ops.opname = (void *)vmi_nop; \ + opname = (void *)vmi_nop; \ else if (rel->type != VMI_RELOCATION_NONE) \ printk(KERN_WARNING "VMI: Unknown relocation " \ "type %d for " #vmicall"\n",\ @@ -712,7 +710,7 @@ do { \ VMI_CALL_##vmicall); \ BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ if (rel->type == VMI_RELOCATION_CALL_REL) { \ - paravirt_ops.opname = wrapper; \ + opname = wrapper; \ vmi_ops.cache = (void *)rel->eip; \ } \ } while (0) @@ -732,11 +730,11 @@ static inline int __init activate_vmi(void) } savesegment(cs, kernel_cs); - paravirt_ops.paravirt_enabled = 1; - paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + pv_info.paravirt_enabled = 1; + pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + pv_info.name = "vmi"; - paravirt_ops.patch = vmi_patch; - paravirt_ops.name = "vmi"; + pv_init_ops.patch = vmi_patch; /* * Many of these operations are ABI compatible with VMI. @@ -754,26 +752,26 @@ static inline int __init activate_vmi(void) */ /* CPUID is special, so very special it gets wrapped like a present */ - para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); - - para_fill(clts, CLTS); - para_fill(get_debugreg, GetDR); - para_fill(set_debugreg, SetDR); - para_fill(read_cr0, GetCR0); - para_fill(read_cr2, GetCR2); - para_fill(read_cr3, GetCR3); - para_fill(read_cr4, GetCR4); - para_fill(write_cr0, SetCR0); - para_fill(write_cr2, SetCR2); - para_fill(write_cr3, SetCR3); - para_fill(write_cr4, SetCR4); - para_fill(save_fl, GetInterruptMask); - para_fill(restore_fl, SetInterruptMask); - para_fill(irq_disable, DisableInterrupts); - para_fill(irq_enable, EnableInterrupts); - - para_fill(wbinvd, WBINVD); - para_fill(read_tsc, RDTSC); + para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID); + + para_fill(pv_cpu_ops.clts, CLTS); + para_fill(pv_cpu_ops.get_debugreg, GetDR); + para_fill(pv_cpu_ops.set_debugreg, SetDR); + para_fill(pv_cpu_ops.read_cr0, GetCR0); + para_fill(pv_mmu_ops.read_cr2, GetCR2); + para_fill(pv_mmu_ops.read_cr3, GetCR3); + para_fill(pv_cpu_ops.read_cr4, GetCR4); + para_fill(pv_cpu_ops.write_cr0, SetCR0); + para_fill(pv_mmu_ops.write_cr2, SetCR2); + para_fill(pv_mmu_ops.write_cr3, SetCR3); + para_fill(pv_cpu_ops.write_cr4, SetCR4); + para_fill(pv_irq_ops.save_fl, GetInterruptMask); + para_fill(pv_irq_ops.restore_fl, SetInterruptMask); + para_fill(pv_irq_ops.irq_disable, DisableInterrupts); + para_fill(pv_irq_ops.irq_enable, EnableInterrupts); + + para_fill(pv_cpu_ops.wbinvd, WBINVD); + para_fill(pv_cpu_ops.read_tsc, RDTSC); /* The following we emulate with trap and emulate for now */ /* paravirt_ops.read_msr = vmi_rdmsr */ @@ -781,29 +779,38 @@ static inline int __init activate_vmi(void) /* paravirt_ops.rdpmc = vmi_rdpmc */ /* TR interface doesn't pass TR value, wrap */ - para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); + para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR); /* LDT is special, too */ - para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); - - para_fill(load_gdt, SetGDT); - para_fill(load_idt, SetIDT); - para_fill(store_gdt, GetGDT); - para_fill(store_idt, GetIDT); - para_fill(store_tr, GetTR); - paravirt_ops.load_tls = vmi_load_tls; - para_fill(write_ldt_entry, WriteLDTEntry); - para_fill(write_gdt_entry, WriteGDTEntry); - para_fill(write_idt_entry, WriteIDTEntry); - para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); - para_fill(set_iopl_mask, SetIOPLMask); - para_fill(io_delay, IODelay); - para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); + para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT); + + para_fill(pv_cpu_ops.load_gdt, SetGDT); + para_fill(pv_cpu_ops.load_idt, SetIDT); + para_fill(pv_cpu_ops.store_gdt, GetGDT); + para_fill(pv_cpu_ops.store_idt, GetIDT); + para_fill(pv_cpu_ops.store_tr, GetTR); + pv_cpu_ops.load_tls = vmi_load_tls; + para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); + para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); + para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); + para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); + para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); + para_fill(pv_cpu_ops.io_delay, IODelay); + + para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + set_lazy_mode, SetLazyMode); + para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + set_lazy_mode, SetLazyMode); + + para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, + set_lazy_mode, SetLazyMode); + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); - para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); - para_fill(flush_tlb_single, InvalPage); + para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); + para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); + para_fill(pv_mmu_ops.flush_tlb_single, InvalPage); /* * Until a standard flag format can be agreed on, we need to @@ -819,41 +826,41 @@ static inline int __init activate_vmi(void) #endif if (vmi_ops.set_pte) { - paravirt_ops.set_pte = vmi_set_pte; - paravirt_ops.set_pte_at = vmi_set_pte_at; - paravirt_ops.set_pmd = vmi_set_pmd; + pv_mmu_ops.set_pte = vmi_set_pte; + pv_mmu_ops.set_pte_at = vmi_set_pte_at; + pv_mmu_ops.set_pmd = vmi_set_pmd; #ifdef CONFIG_X86_PAE - paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; - paravirt_ops.set_pte_present = vmi_set_pte_present; - paravirt_ops.set_pud = vmi_set_pud; - paravirt_ops.pte_clear = vmi_pte_clear; - paravirt_ops.pmd_clear = vmi_pmd_clear; + pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; + pv_mmu_ops.set_pte_present = vmi_set_pte_present; + pv_mmu_ops.set_pud = vmi_set_pud; + pv_mmu_ops.pte_clear = vmi_pte_clear; + pv_mmu_ops.pmd_clear = vmi_pmd_clear; #endif } if (vmi_ops.update_pte) { - paravirt_ops.pte_update = vmi_update_pte; - paravirt_ops.pte_update_defer = vmi_update_pte_defer; + pv_mmu_ops.pte_update = vmi_update_pte; + pv_mmu_ops.pte_update_defer = vmi_update_pte_defer; } vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); if (vmi_ops.allocate_page) { - paravirt_ops.alloc_pt = vmi_allocate_pt; - paravirt_ops.alloc_pd = vmi_allocate_pd; - paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; + pv_mmu_ops.alloc_pt = vmi_allocate_pt; + pv_mmu_ops.alloc_pd = vmi_allocate_pd; + pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; } vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); if (vmi_ops.release_page) { - paravirt_ops.release_pt = vmi_release_pt; - paravirt_ops.release_pd = vmi_release_pd; + pv_mmu_ops.release_pt = vmi_release_pt; + pv_mmu_ops.release_pd = vmi_release_pd; } /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); #ifdef CONFIG_HIGHPTE if (vmi_ops.set_linear_mapping) - paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; + pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; #endif /* @@ -863,17 +870,17 @@ static inline int __init activate_vmi(void) * the backend. They are performance critical anyway, so requiring * a patch is not a big problem. */ - paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; - paravirt_ops.iret = (void *)0xbadbab0; + pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; + pv_cpu_ops.iret = (void *)0xbadbab0; #ifdef CONFIG_SMP - para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); + para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); #endif #ifdef CONFIG_X86_LOCAL_APIC - para_fill(apic_read, APICRead); - para_fill(apic_write, APICWrite); - para_fill(apic_write_atomic, APICWrite); + para_fill(pv_apic_ops.apic_read, APICRead); + para_fill(pv_apic_ops.apic_write, APICWrite); + para_fill(pv_apic_ops.apic_write_atomic, APICWrite); #endif /* @@ -891,15 +898,15 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - paravirt_ops.time_init = vmi_time_init; - paravirt_ops.get_wallclock = vmi_get_wallclock; - paravirt_ops.set_wallclock = vmi_set_wallclock; + pv_time_ops.time_init = vmi_time_init; + pv_time_ops.get_wallclock = vmi_get_wallclock; + pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - paravirt_ops.setup_boot_clock = vmi_time_bsp_init; - paravirt_ops.setup_secondary_clock = vmi_time_ap_init; + pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; + pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.sched_clock = vmi_sched_clock; - paravirt_ops.get_cpu_khz = vmi_cpu_khz; + pv_time_ops.sched_clock = vmi_sched_clock; + pv_time_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; @@ -908,7 +915,7 @@ static inline int __init activate_vmi(void) disable_vmi_timer = 1; } - para_fill(safe_halt, Halt); + para_fill(pv_irq_ops.safe_halt, Halt); /* * Alternative instruction rewriting doesn't happen soon enough diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index dda4e83649a0..33d367a3432e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -741,24 +741,12 @@ struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { - size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); - - if (PTRS_PER_PMD > 1) { + if (PTRS_PER_PMD > 1) pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - SLAB_PANIC, - pmd_ctor); - if (!SHARED_KERNEL_PMD) { - /* If we're in PAE mode and have a non-shared - kernel pmd, then the pgd size must be a - page size. This is because the pgd_list - links through the page structure, so there - can only be one pgd per page for this to - work. */ - pgd_size = PAGE_SIZE; - } - } + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), + SLAB_PANIC, + pmd_ctor); } /* diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 493a083f6886..94c39aaf695f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -25,7 +25,6 @@ #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/highmem.h> -#include <linux/smp.h> #include <xen/interface/xen.h> #include <xen/interface/physdev.h> @@ -52,11 +51,25 @@ EXPORT_SYMBOL_GPL(hypercall_page); -DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); - DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); -DEFINE_PER_CPU(unsigned long, xen_cr3); + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. + * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu) info.mfn = virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", cpu, vcpup, info.mfn, info.offset); /* Check to see if the hypervisor will put the vcpu_info @@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu) static void __init xen_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - paravirt_ops.name); + pv_info.name); printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); } @@ -249,29 +262,10 @@ static void xen_halt(void) xen_safe_halt(); } -static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) +static void xen_leave_lazy(void) { - BUG_ON(preemptible()); - - switch (mode) { - case PARAVIRT_LAZY_NONE: - BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); - break; - - case PARAVIRT_LAZY_MMU: - case PARAVIRT_LAZY_CPU: - BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); - break; - - case PARAVIRT_LAZY_FLUSH: - /* flush if necessary, but don't change state */ - if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) - xen_mc_flush(); - return; - } - + paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); - x86_write_percpu(xen_lazy_mode, mode); } static unsigned long xen_store_tr(void) @@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * loaded properly. This will go away as soon as Xen has been * modified to not save/restore %gs for normal hypercalls. */ - if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) loadsegment(gs, 0); } @@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void) return x86_read_percpu(xen_cr3); } +static void set_current_cr3(void *v) +{ + x86_write_percpu(xen_current_cr3, (unsigned long)v); +} + static void xen_write_cr3(unsigned long cr3) { + struct mmuext_op *op; + struct multicall_space mcs; + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + BUG_ON(preemptible()); - if (cr3 == x86_read_percpu(xen_cr3)) { - /* just a simple tlb flush */ - xen_flush_tlb(); - return; - } + mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ x86_write_percpu(xen_cr3, cr3); + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; - { - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); - - op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; - op->arg1.mfn = mfn; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + /* Update xen_update_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); - xen_mc_issue(PARAVIRT_LAZY_CPU); - } + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } /* Early in boot, while setting up the initial pagetable, assume @@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } +static void pin_pagetable_pfn(unsigned level, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = level; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) @@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) if (PagePinned(virt_to_page(mm->pgd))) { SetPagePinned(page); - if (!PageHighMem(page)) + if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - else + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + } else /* make sure there are no stray mappings of this page */ kmap_flush_unused(); @@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn) struct page *page = pfn_to_page(pfn); if (PagePinned(page)) { - if (!PageHighMem(page)) + if (!PageHighMem(page)) { + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } } } @@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; /* special set_pte for pagetable initialization */ - paravirt_ops.set_pte = xen_set_pte_init; + pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* @@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base) { /* This will work as long as patching hasn't happened yet (which it hasn't) */ - paravirt_ops.alloc_pt = xen_alloc_pt; - paravirt_ops.set_pte = xen_set_pte; + pv_mmu_ops.alloc_pt = xen_alloc_pt; + pv_mmu_ops.set_pte = xen_set_pte; if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* @@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ { - struct mmuext_op op; + unsigned level; + #ifdef CONFIG_X86_PAE - op.cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L3_TABLE; #else - op.cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L2_TABLE; #endif - op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); + + pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); } } @@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void) if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); - paravirt_ops.save_fl = xen_save_fl_direct; - paravirt_ops.restore_fl = xen_restore_fl_direct; - paravirt_ops.irq_disable = xen_irq_disable_direct; - paravirt_ops.irq_enable = xen_irq_enable_direct; - paravirt_ops.read_cr2 = xen_read_cr2_direct; - paravirt_ops.iret = xen_iret_direct; + pv_irq_ops.save_fl = xen_save_fl_direct; + pv_irq_ops.restore_fl = xen_restore_fl_direct; + pv_irq_ops.irq_disable = xen_irq_disable_direct; + pv_irq_ops.irq_enable = xen_irq_enable_direct; + pv_mmu_ops.read_cr2 = xen_read_cr2_direct; + pv_cpu_ops.iret = xen_iret_direct; } } @@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, start = end = reloc = NULL; -#define SITE(x) \ - case PARAVIRT_PATCH(x): \ +#define SITE(op, x) \ + case PARAVIRT_PATCH(op.x): \ if (have_vcpu_info_placement) { \ start = (char *)xen_##x##_direct; \ end = xen_##x##_direct_end; \ @@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { - SITE(irq_enable); - SITE(irq_disable); - SITE(save_fl); - SITE(restore_fl); + SITE(pv_irq_ops, irq_enable); + SITE(pv_irq_ops, irq_disable); + SITE(pv_irq_ops, save_fl); + SITE(pv_irq_ops, restore_fl); #undef SITE patch_site: @@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static const struct paravirt_ops xen_paravirt_ops __initdata = { +static const struct pv_info xen_info __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, .name = "Xen", - .banner = xen_banner, +}; +static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, + .banner = xen_banner, .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, - .init_IRQ = xen_init_IRQ, .post_allocator_init = xen_mark_init_mm_pinned, +}; +static const struct pv_time_ops xen_time_ops __initdata = { .time_init = xen_time_init, + .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, .get_cpu_khz = xen_cpu_khz, .sched_clock = xen_sched_clock, +}; +static const struct pv_cpu_ops xen_cpu_ops __initdata = { .cpuid = xen_cpuid, .set_debugreg = xen_set_debugreg, @@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3, - .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = xen_write_cr4, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, @@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, + .lazy_mode = { + .enter = paravirt_enter_lazy_cpu, + .leave = xen_leave_lazy, + }, +}; + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, +}; + +static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, .apic_write_atomic = xen_apic_write, @@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, #endif +}; + +static const struct pv_mmu_ops xen_mmu_ops __initdata = { + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, @@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - .alloc_pt = xen_alloc_pt_init, .release_pt = xen_release_pt, .alloc_pd = paravirt_nop, @@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .dup_mmap = xen_dup_mmap, .exit_mmap = xen_exit_mmap, - .set_lazy_mode = xen_set_lazy_mode, + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy, + }, }; #ifdef CONFIG_SMP @@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = { }; +static void __init xen_reserve_top(void) +{ + unsigned long top = HYPERVISOR_VIRT_START; + struct xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top + 2 * PAGE_SIZE); +} + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void) BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); /* Install Xen paravirt ops */ - paravirt_ops = xen_paravirt_ops; + pv_info = xen_info; + pv_init_ops = xen_init_ops; + pv_time_ops = xen_time_ops; + pv_cpu_ops = xen_cpu_ops; + pv_irq_ops = xen_irq_ops; + pv_apic_ops = xen_apic_ops; + pv_mmu_ops = xen_mmu_ops; + machine_ops = xen_machine_ops; #ifdef CONFIG_SMP @@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void) /* keep using Xen gdt for now; no urgent need to change it */ x86_write_percpu(xen_cr3, __pa(pgd)); + x86_write_percpu(xen_current_cr3, __pa(pgd)); #ifdef CONFIG_SMP /* Don't do the full vcpu_info placement stuff until we have a @@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void) xen_setup_vcpu_info_placement(); #endif - paravirt_ops.kernel_rpl = 1; + pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) - paravirt_ops.kernel_rpl = 0; + pv_info.kernel_rpl = 0; /* set the limit of our address space */ - reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + xen_reserve_top(); /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 874db0cd1d2a..b2e32f9d0071 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -41,7 +41,6 @@ #include <linux/sched.h> #include <linux/highmem.h> #include <linux/bug.h> -#include <linux/sched.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { if (mm == current->mm || mm == &init_mm) { - if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); @@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd) } #endif /* CONFIG_X86_PAE */ - +enum pt_level { + PT_PGD, + PT_PUD, + PT_PMD, + PT_PTE +}; /* (Yet another) pagetable walker. This one is intended for pinning a @@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd) FIXADDR_TOP. But the important bit is that we don't pin beyond there, because then we start getting into Xen's ptes. */ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), +static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), unsigned long limit) { pgd_t *pgd = pgd_base; @@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), pud = pud_offset(pgd, 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), 0); + flush |= (*func)(virt_to_page(pud), PT_PUD); for (; addr != pud_limit; pud++, addr = pud_next) { pmd_t *pmd; @@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), pmd = pmd_offset(pud, 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), 0); + flush |= (*func)(virt_to_page(pmd), PT_PMD); for (; addr != pmd_limit; pmd++) { addr += (PAGE_SIZE * PTRS_PER_PTE); @@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), if (pmd_none(*pmd)) continue; - flush |= (*func)(pmd_page(*pmd), 0); + flush |= (*func)(pmd_page(*pmd), PT_PTE); } } } - flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); + flush |= (*func)(virt_to_page(pgd_base), PT_PGD); return flush; } -static int pin_page(struct page *page, unsigned flags) +static spinlock_t *lock_pte(struct page *page) +{ + spinlock_t *ptl = NULL; + +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + ptl = __pte_lockptr(page); + spin_lock(ptl); +#endif + + return ptl; +} + +static void do_unlock(void *v) +{ + spinlock_t *ptl = v; + spin_unlock(ptl); +} + +static void xen_do_pin(unsigned level, unsigned long pfn) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = level; + op->arg1.mfn = pfn_to_mfn(pfn); + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); +} + +static int pin_page(struct page *page, enum pt_level level) { unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); int flush; @@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags) void *pt = lowmem_page_address(page); unsigned long pfn = page_to_pfn(page); struct multicall_space mcs = __xen_mc_entry(0); + spinlock_t *ptl; flush = 0; + ptl = NULL; + if (level == PT_PTE) + ptl = lock_pte(page); + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), - flags); + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (level == PT_PTE) + xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); + + if (ptl) { + /* Queue a deferred unlock for when this batch + is completed. */ + xen_mc_callback(do_unlock, ptl); + } } return flush; @@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - struct multicall_space mcs; - struct mmuext_op *op; + unsigned level; xen_mc_batch(); @@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } - mcs = __xen_mc_entry(sizeof(*op)); - op = mcs.args; - #ifdef CONFIG_X86_PAE - op->cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L3_TABLE; #else - op->cmd = MMUEXT_PIN_L2_TABLE; + level = MMUEXT_PIN_L2_TABLE; #endif - op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_do_pin(level, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } @@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd) /* The init_mm pagetable is really pinned as soon as its created, but that's before we have page structures to store the bits. So do all the book-keeping now. */ -static __init int mark_pinned(struct page *page, unsigned flags) +static __init int mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); return 0; @@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void) pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, unsigned flags) +static int unpin_page(struct page *page, enum pt_level level) { unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); if (pgfl && !PageHighMem(page)) { void *pt = lowmem_page_address(page); unsigned long pfn = page_to_pfn(page); - struct multicall_space mcs = __xen_mc_entry(0); + spinlock_t *ptl = NULL; + struct multicall_space mcs; + + if (level == PT_PTE) { + ptl = lock_pte(page); + + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + } + + mcs = __xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL), - flags); + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (ptl) { + /* unlock when batch completed */ + xen_mc_callback(do_unlock, ptl); + } } return 0; /* never need to flush on unpin */ @@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags) /* Release a pagetables pages back as normal RW */ static void xen_pgd_unpin(pgd_t *pgd) { - struct mmuext_op *op; - struct multicall_space mcs; - xen_mc_batch(); - mcs = __xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_UNPIN_TABLE; - op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); pgd_walk(pgd, unpin_page, TASK_SIZE); @@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info) if (__get_cpu_var(cpu_tlbstate).active_mm == mm) leave_mm(smp_processor_id()); + + /* If this cpu still has a stale cr3 reference, then make sure + it has been flushed. */ + if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { + load_cr3(swapper_pg_dir); + arch_flush_lazy_cpu_mode(); + } } static void drop_mm_ref(struct mm_struct *mm) { + cpumask_t mask; + unsigned cpu; + if (current->active_mm == mm) { if (current->mm == mm) load_cr3(swapper_pg_dir); else leave_mm(smp_processor_id()); + arch_flush_lazy_cpu_mode(); } - if (!cpus_empty(mm->cpu_vm_mask)) - xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, - mm, 1); + /* Get the "official" set of cpus referring to our pagetable. */ + mask = mm->cpu_vm_mask; + + /* It's possible that a vcpu may have a stale reference to our + cr3, because its in lazy mode, and it hasn't yet flushed + its set of pending hypercalls yet. In this case, we can + look at its actual current cr3 value, and force it to flush + if needed. */ + for_each_online_cpu(cpu) { + if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) + cpu_set(cpu, mask); + } + + if (!cpus_empty(mask)) + xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else static void drop_mm_ref(struct mm_struct *mm) @@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm) /* pgd may not be pinned in the error exit path of execve */ if (PagePinned(virt_to_page(mm->pgd))) xen_pgd_unpin(mm->pgd); + spin_unlock(&mm->page_table_lock); } diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index c837e8e463db..5e6f36f6d876 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -26,13 +26,22 @@ #include "multicalls.h" +#define MC_DEBUG 1 + #define MC_BATCH 32 #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) struct mc_buffer { struct multicall_entry entries[MC_BATCH]; +#if MC_DEBUG + struct multicall_entry debug[MC_BATCH]; +#endif u64 args[MC_ARGS]; - unsigned mcidx, argidx; + struct callback { + void (*fn)(void *); + void *data; + } callbacks[MC_BATCH]; + unsigned mcidx, argidx, cbidx; }; static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); @@ -43,6 +52,7 @@ void xen_mc_flush(void) struct mc_buffer *b = &__get_cpu_var(mc_buffer); int ret = 0; unsigned long flags; + int i; BUG_ON(preemptible()); @@ -51,13 +61,31 @@ void xen_mc_flush(void) local_irq_save(flags); if (b->mcidx) { - int i; +#if MC_DEBUG + memcpy(b->debug, b->entries, + b->mcidx * sizeof(struct multicall_entry)); +#endif if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) BUG(); for (i = 0; i < b->mcidx; i++) if (b->entries[i].result < 0) ret++; + +#if MC_DEBUG + if (ret) { + printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", + ret, smp_processor_id()); + for(i = 0; i < b->mcidx; i++) { + printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", + i+1, b->mcidx, + b->debug[i].op, + b->debug[i].args[0], + b->entries[i].result); + } + } +#endif + b->mcidx = 0; b->argidx = 0; } else @@ -65,6 +93,13 @@ void xen_mc_flush(void) local_irq_restore(flags); + for(i = 0; i < b->cbidx; i++) { + struct callback *cb = &b->callbacks[i]; + + (*cb->fn)(cb->data); + } + b->cbidx = 0; + BUG_ON(ret); } @@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args) return ret; } + +void xen_mc_callback(void (*fn)(void *), void *data) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct callback *cb; + + if (b->cbidx == MC_BATCH) + xen_mc_flush(); + + cb = &b->callbacks[b->cbidx++]; + cb->fn = fn; + cb->data = data; +} diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index e6f7530b156c..8bae996d99a3 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -35,11 +35,14 @@ void xen_mc_flush(void); /* Issue a multicall if we're not in a lazy mode */ static inline void xen_mc_issue(unsigned mode) { - if ((xen_get_lazy_mode() & mode) == 0) + if ((paravirt_get_lazy_mode() & mode) == 0) xen_mc_flush(); /* restore flags saved in xen_mc_batch */ local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); } +/* Set up a callback to be called when the current batch is flushed */ +void xen_mc_callback(void (*fn)(void *), void *data); + #endif /* _XEN_MULTICALLS_H */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4fa33c27ccb6..d53bf9d8a72d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait) { struct call_data_struct data; - int cpus; + int cpus, cpu; + bool yield; /* Holding any lock stops cpus from going down. */ spin_lock(&call_lock); @@ -399,9 +400,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), /* Send a message to other CPUs and wait for them to respond */ xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); - /* Make sure other vcpus get a chance to run. - XXX too severe? Maybe we should check the other CPU's states? */ - HYPERVISOR_sched_op(SCHEDOP_yield, 0); + /* Make sure other vcpus get a chance to run if they need to. */ + yield = false; + for_each_cpu_mask(cpu, mask) + if (xen_vcpu_stolen(cpu)) + yield = true; + + if (yield) + HYPERVISOR_sched_op(SCHEDOP_yield, 0); /* Wait for response */ while (atomic_read(&data.started) != cpus || diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index dfd6db69ead5..d083ff5ef088 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) } while (get64(&state->state_entry_time) != state_time); } +/* return true when a vcpu could run but has no real cpu to run on */ +bool xen_vcpu_stolen(int vcpu) +{ + return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; +} + static void setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b9aaea45f07f..b02a909bfd4c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps); DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(unsigned long, xen_cr3); +DECLARE_PER_CPU(unsigned long, xen_current_cr3); extern struct start_info *xen_start_info; extern struct shared_info *HYPERVISOR_shared_info; @@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); -void xen_mark_init_mm_pinned(void); - -DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); +bool xen_vcpu_stolen(int vcpu); -static inline unsigned xen_get_lazy_mode(void) -{ - return x86_read_percpu(xen_lazy_mode); -} +void xen_mark_init_mm_pinned(void); void __init xen_fill_possible_map(void); diff --git a/drivers/char/hvc_lguest.c b/drivers/char/hvc_lguest.c index 3d6bd0baa56d..efccb2155830 100644 --- a/drivers/char/hvc_lguest.c +++ b/drivers/char/hvc_lguest.c @@ -115,7 +115,7 @@ static struct hv_ops lguest_cons = { * (0), and the struct hv_ops containing the put_chars() function. */ static int __init cons_init(void) { - if (strcmp(paravirt_ops.name, "lguest") != 0) + if (strcmp(pv_info.name, "lguest") != 0) return 0; return hvc_instantiate(0, 0, &lguest_cons); diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 4a315f08a567..a0788c12b392 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -248,8 +248,8 @@ static void unmap_switcher(void) } /*H:130 Our Guest is usually so well behaved; it never tries to do things it - * isn't allowed to. Unfortunately, "struct paravirt_ops" isn't quite - * complete, because it doesn't contain replacements for the Intel I/O + * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't + * quite complete, because it doesn't contain replacements for the Intel I/O * instructions. As a result, the Guest sometimes fumbles across one during * the boot process as it probes for various things which are usually attached * to a PC. @@ -694,7 +694,7 @@ static int __init init(void) /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ if (paravirt_enabled()) { - printk("lguest is afraid of %s\n", paravirt_ops.name); + printk("lguest is afraid of %s\n", pv_info.name); return -EPERM; } diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c index 4a579c840301..3ba337dde857 100644 --- a/drivers/lguest/lguest.c +++ b/drivers/lguest/lguest.c @@ -23,7 +23,7 @@ * * So how does the kernel know it's a Guest? The Guest starts at a special * entry point marked with a magic string, which sets up a few things then - * calls here. We replace the native functions in "struct paravirt_ops" + * calls here. We replace the native functions various "paravirt" structures * with our Guest versions, then boot like normal. :*/ /* @@ -97,29 +97,17 @@ static cycle_t clock_base; * them as a batch when lazy_mode is eventually turned off. Because hypercalls * are reasonably expensive, batching them up makes sense. For example, a * large mmap might update dozens of page table entries: that code calls - * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls - * lguest_lazy_mode(PARAVIRT_LAZY_NONE). + * paravirt_enter_lazy_mmu(), does the dozen updates, then calls + * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hypercall() to store the call for * future processing. When lazy mode is turned off we issue a hypercall to * flush the stored calls. - * - * There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which - * indicates we're to flush any outstanding calls immediately. This is used - * when an interrupt handler does a kmap_atomic(): the page table changes must - * happen immediately even if we're in the middle of a batch. Usually we're - * not, though, so there's nothing to do. */ -static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */ -static void lguest_lazy_mode(enum paravirt_lazy_mode mode) + */ +static void lguest_leave_lazy_mode(void) { - if (mode == PARAVIRT_LAZY_FLUSH) { - if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE)) - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); - } else { - lazy_mode = mode; - if (mode == PARAVIRT_LAZY_NONE) - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); - } + paravirt_leave_lazy(paravirt_get_lazy_mode()); + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); } static void lazy_hcall(unsigned long call, @@ -127,7 +115,7 @@ static void lazy_hcall(unsigned long call, unsigned long arg2, unsigned long arg3) { - if (lazy_mode == PARAVIRT_LAZY_NONE) + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) hcall(call, arg1, arg2, arg3); else async_hcall(call, arg1, arg2, arg3); @@ -331,7 +319,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) } /*G:038 That's enough excitement for now, back to ploughing through each of - * the paravirt_ops (we're about 1/3 of the way through). + * the different pv_ops structures (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. We don't do anything @@ -558,7 +546,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); } -/* Unfortunately for Lguest, the paravirt_ops for page tables were based on +/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). @@ -782,7 +770,7 @@ static void lguest_time_init(void) clocksource_register(&lguest_clock); /* Now we've set up our clock, we can use it as the scheduler clock */ - paravirt_ops.sched_clock = lguest_sched_clock; + pv_time_ops.sched_clock = lguest_sched_clock; /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ @@ -904,7 +892,7 @@ static __init char *lguest_memory_setup(void) /*G:050 * Patching (Powerfully Placating Performance Pedants) * - * We have already seen that "struct paravirt_ops" lets us replace simple + * We have already seen that pv_ops structures let us replace simple * native instructions with calls to the appropriate back end all throughout * the kernel. This allows the same kernel to run as a Guest and as a native * kernel, but it's slow because of all the indirect branches. @@ -929,10 +917,10 @@ static const struct lguest_insns { const char *start, *end; } lguest_insns[] = { - [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti }, - [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, - [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, + [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, + [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, + [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, + [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; /* Now our patch routine is fairly simple (based on the native one in @@ -959,9 +947,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops - * structure in the kernel provides a single point for (almost) every routine - * we have to override to avoid privileged instructions. */ +/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops + * structures in the kernel provide points for (almost) every routine we have + * to override to avoid privileged instructions. */ __init void lguest_init(void *boot) { /* Copy boot parameters first: the Launcher put the physical location @@ -976,54 +964,70 @@ __init void lguest_init(void *boot) /* We're under lguest, paravirt is enabled, and we're running at * privilege level 1, not 0 as normal. */ - paravirt_ops.name = "lguest"; - paravirt_ops.paravirt_enabled = 1; - paravirt_ops.kernel_rpl = 1; + pv_info.name = "lguest"; + pv_info.paravirt_enabled = 1; + pv_info.kernel_rpl = 1; /* We set up all the lguest overrides for sensitive operations. These * are detailed with the operations themselves. */ - paravirt_ops.save_fl = save_fl; - paravirt_ops.restore_fl = restore_fl; - paravirt_ops.irq_disable = irq_disable; - paravirt_ops.irq_enable = irq_enable; - paravirt_ops.load_gdt = lguest_load_gdt; - paravirt_ops.memory_setup = lguest_memory_setup; - paravirt_ops.cpuid = lguest_cpuid; - paravirt_ops.write_cr3 = lguest_write_cr3; - paravirt_ops.flush_tlb_user = lguest_flush_tlb_user; - paravirt_ops.flush_tlb_single = lguest_flush_tlb_single; - paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - paravirt_ops.set_pte = lguest_set_pte; - paravirt_ops.set_pte_at = lguest_set_pte_at; - paravirt_ops.set_pmd = lguest_set_pmd; + + /* interrupt-related operations */ + pv_irq_ops.init_IRQ = lguest_init_IRQ; + pv_irq_ops.save_fl = save_fl; + pv_irq_ops.restore_fl = restore_fl; + pv_irq_ops.irq_disable = irq_disable; + pv_irq_ops.irq_enable = irq_enable; + pv_irq_ops.safe_halt = lguest_safe_halt; + + /* init-time operations */ + pv_init_ops.memory_setup = lguest_memory_setup; + pv_init_ops.patch = lguest_patch; + + /* Intercepts of various cpu instructions */ + pv_cpu_ops.load_gdt = lguest_load_gdt; + pv_cpu_ops.cpuid = lguest_cpuid; + pv_cpu_ops.load_idt = lguest_load_idt; + pv_cpu_ops.iret = lguest_iret; + pv_cpu_ops.load_esp0 = lguest_load_esp0; + pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; + pv_cpu_ops.set_ldt = lguest_set_ldt; + pv_cpu_ops.load_tls = lguest_load_tls; + pv_cpu_ops.set_debugreg = lguest_set_debugreg; + pv_cpu_ops.clts = lguest_clts; + pv_cpu_ops.read_cr0 = lguest_read_cr0; + pv_cpu_ops.write_cr0 = lguest_write_cr0; + pv_cpu_ops.read_cr4 = lguest_read_cr4; + pv_cpu_ops.write_cr4 = lguest_write_cr4; + pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; + pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; + pv_cpu_ops.wbinvd = lguest_wbinvd; + pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; + pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + + /* pagetable management */ + pv_mmu_ops.write_cr3 = lguest_write_cr3; + pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; + pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; + pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; + pv_mmu_ops.set_pte = lguest_set_pte; + pv_mmu_ops.set_pte_at = lguest_set_pte_at; + pv_mmu_ops.set_pmd = lguest_set_pmd; + pv_mmu_ops.read_cr2 = lguest_read_cr2; + pv_mmu_ops.read_cr3 = lguest_read_cr3; + pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; + pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + #ifdef CONFIG_X86_LOCAL_APIC - paravirt_ops.apic_write = lguest_apic_write; - paravirt_ops.apic_write_atomic = lguest_apic_write; - paravirt_ops.apic_read = lguest_apic_read; + /* apic read/write intercepts */ + pv_apic_ops.apic_write = lguest_apic_write; + pv_apic_ops.apic_write_atomic = lguest_apic_write; + pv_apic_ops.apic_read = lguest_apic_read; #endif - paravirt_ops.load_idt = lguest_load_idt; - paravirt_ops.iret = lguest_iret; - paravirt_ops.load_esp0 = lguest_load_esp0; - paravirt_ops.load_tr_desc = lguest_load_tr_desc; - paravirt_ops.set_ldt = lguest_set_ldt; - paravirt_ops.load_tls = lguest_load_tls; - paravirt_ops.set_debugreg = lguest_set_debugreg; - paravirt_ops.clts = lguest_clts; - paravirt_ops.read_cr0 = lguest_read_cr0; - paravirt_ops.write_cr0 = lguest_write_cr0; - paravirt_ops.init_IRQ = lguest_init_IRQ; - paravirt_ops.read_cr2 = lguest_read_cr2; - paravirt_ops.read_cr3 = lguest_read_cr3; - paravirt_ops.read_cr4 = lguest_read_cr4; - paravirt_ops.write_cr4 = lguest_write_cr4; - paravirt_ops.write_gdt_entry = lguest_write_gdt_entry; - paravirt_ops.write_idt_entry = lguest_write_idt_entry; - paravirt_ops.patch = lguest_patch; - paravirt_ops.safe_halt = lguest_safe_halt; - paravirt_ops.get_wallclock = lguest_get_wallclock; - paravirt_ops.time_init = lguest_time_init; - paravirt_ops.set_lazy_mode = lguest_lazy_mode; - paravirt_ops.wbinvd = lguest_wbinvd; + + /* time operations */ + pv_time_ops.get_wallclock = lguest_get_wallclock; + pv_time_ops.time_init = lguest_time_init; + /* Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). */ diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c index 9e7752cc8002..57329788f8a7 100644 --- a/drivers/lguest/lguest_bus.c +++ b/drivers/lguest/lguest_bus.c @@ -201,7 +201,7 @@ static void scan_devices(void) * "struct lguest_device_desc" array. */ static int __init lguest_bus_init(void) { - if (strcmp(paravirt_ops.name, "lguest") != 0) + if (strcmp(pv_info.name, "lguest") != 0) return 0; /* Devices are in a single page above top of "normal" mem */ diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 9fa3fa9e62d1..f59d370c5df4 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -25,27 +25,22 @@ struct tss_struct; struct mm_struct; struct desc_struct; -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE = 0, - PARAVIRT_LAZY_MMU = 1, - PARAVIRT_LAZY_CPU = 2, - PARAVIRT_LAZY_FLUSH = 3, -}; - -struct paravirt_ops -{ +/* general info */ +struct pv_info { unsigned int kernel_rpl; int shared_kernel_pmd; - int paravirt_enabled; + int paravirt_enabled; const char *name; +}; +struct pv_init_ops { /* - * Patch may replace one of the defined code sequences with arbitrary - * code, subject to the same register constraints. This generally - * means the code is not free to clobber any registers other than EAX. - * The patch function should return the number of bytes of code - * generated, as we nop pad the rest in generic code. + * Patch may replace one of the defined code sequences with + * arbitrary code, subject to the same register constraints. + * This generally means the code is not free to clobber any + * registers other than EAX. The patch function should return + * the number of bytes of code generated, as we nop pad the + * rest in generic code. */ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, unsigned long addr, unsigned len); @@ -55,29 +50,29 @@ struct paravirt_ops char *(*memory_setup)(void); void (*post_allocator_init)(void); - void (*init_IRQ)(void); - void (*time_init)(void); - - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - /* Print a banner to identify the environment */ void (*banner)(void); +}; + + +struct pv_lazy_ops { + /* Set deferred update mode, used for batching operations. */ + void (*enter)(void); + void (*leave)(void); +}; + +struct pv_time_ops { + void (*time_init)(void); /* Set and set time of day */ unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long); - /* cpuid emulation, mostly so that caps bits can be disabled */ - void (*cpuid)(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx); + unsigned long long (*sched_clock)(void); + unsigned long (*get_cpu_khz)(void); +}; +struct pv_cpu_ops { /* hooks for various privileged instructions */ unsigned long (*get_debugreg)(int regno); void (*set_debugreg)(int regno, unsigned long value); @@ -87,41 +82,10 @@ struct paravirt_ops unsigned long (*read_cr0)(void); void (*write_cr0)(unsigned long); - unsigned long (*read_cr2)(void); - void (*write_cr2)(unsigned long); - - unsigned long (*read_cr3)(void); - void (*write_cr3)(unsigned long); - unsigned long (*read_cr4_safe)(void); unsigned long (*read_cr4)(void); void (*write_cr4)(unsigned long); - /* - * Get/set interrupt state. save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. - */ - unsigned long (*save_fl)(void); - void (*restore_fl)(unsigned long); - void (*irq_disable)(void); - void (*irq_enable)(void); - void (*safe_halt)(void); - void (*halt)(void); - - void (*wbinvd)(void); - - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, u64 val); - - u64 (*read_tsc)(void); - u64 (*read_pmc)(void); - unsigned long long (*sched_clock)(void); - unsigned long (*get_cpu_khz)(void); - /* Segment descriptor handling */ void (*load_tr_desc)(void); void (*load_gdt)(const struct Xgt_desc_struct *); @@ -140,18 +104,47 @@ struct paravirt_ops void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t); void (*set_iopl_mask)(unsigned mask); + + void (*wbinvd)(void); void (*io_delay)(void); + /* cpuid emulation, mostly so that caps bits can be disabled */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + /* MSR, PMC and TSR operations. + err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (*read_msr)(unsigned int msr, int *err); + int (*write_msr)(unsigned int msr, u64 val); + + u64 (*read_tsc)(void); + u64 (*read_pmc)(void); + + /* These two are jmp to, not actually called. */ + void (*irq_enable_sysexit)(void); + void (*iret)(void); + + struct pv_lazy_ops lazy_mode; +}; + +struct pv_irq_ops { + void (*init_IRQ)(void); + /* - * Hooks for intercepting the creation/use/destruction of an - * mm_struct. + * Get/set interrupt state. save_fl and restore_fl are only + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); - void (*exit_mmap)(struct mm_struct *mm); + unsigned long (*save_fl)(void); + void (*restore_fl)(unsigned long); + void (*irq_disable)(void); + void (*irq_enable)(void); + void (*safe_halt)(void); + void (*halt)(void); +}; +struct pv_apic_ops { #ifdef CONFIG_X86_LOCAL_APIC /* * Direct APIC operations, principally for VMI. Ideally @@ -167,6 +160,34 @@ struct paravirt_ops unsigned long start_eip, unsigned long start_esp); #endif +}; + +struct pv_mmu_ops { + /* + * Called before/after init_mm pagetable setup. setup_start + * may reset %cr3, and may pre-install parts of the pagetable; + * pagetable setup is expected to preserve any existing + * mapping. + */ + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); + + unsigned long (*read_cr2)(void); + void (*write_cr2)(unsigned long); + + unsigned long (*read_cr3)(void); + void (*write_cr3)(unsigned long); + + /* + * Hooks for intercepting the creation/use/destruction of an + * mm_struct. + */ + void (*activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (*dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (*exit_mmap)(struct mm_struct *mm); + /* TLB operations */ void (*flush_tlb_user)(void); @@ -191,15 +212,12 @@ struct paravirt_ops void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - #ifdef CONFIG_X86_PAE void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); + void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); void (*set_pud)(pud_t *pudp, pud_t pudval); - void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void (*pmd_clear)(pmd_t *pmdp); unsigned long long (*pte_val)(pte_t); @@ -217,21 +235,40 @@ struct paravirt_ops pgd_t (*make_pgd)(unsigned long pgd); #endif - /* Set deferred update mode, used for batching operations. */ - void (*set_lazy_mode)(enum paravirt_lazy_mode mode); +#ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif - /* These two are jmp to, not actually called. */ - void (*irq_enable_sysexit)(void); - void (*iret)(void); + struct pv_lazy_ops lazy_mode; }; -extern struct paravirt_ops paravirt_ops; +/* This contains all the paravirt structures: we get a convenient + * number for each function using the offset which we use to indicate + * what to patch. */ +struct paravirt_patch_template +{ + struct pv_init_ops pv_init_ops; + struct pv_time_ops pv_time_ops; + struct pv_cpu_ops pv_cpu_ops; + struct pv_irq_ops pv_irq_ops; + struct pv_apic_ops pv_apic_ops; + struct pv_mmu_ops pv_mmu_ops; +}; + +extern struct pv_info pv_info; +extern struct pv_init_ops pv_init_ops; +extern struct pv_time_ops pv_time_ops; +extern struct pv_cpu_ops pv_cpu_ops; +extern struct pv_irq_ops pv_irq_ops; +extern struct pv_apic_ops pv_apic_ops; +extern struct pv_mmu_ops pv_mmu_ops; #define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_ops, x) / sizeof(void *)) + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) -#define paravirt_type(type) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(type)) +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "m" (op) #define paravirt_clobber(clobber) \ [paravirt_clobber] "i" (clobber) @@ -258,7 +295,7 @@ unsigned paravirt_patch_call(void *insnbuf, const void *target, u16 tgt_clobbers, unsigned long addr, u16 site_clobbers, unsigned len); -unsigned paravirt_patch_jmp(const void *target, void *insnbuf, +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, unsigned long addr, unsigned len); unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, unsigned long addr, unsigned len); @@ -271,14 +308,14 @@ int paravirt_disable_iospace(void); /* * This generates an indirect call based on the operation type number. * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_ops structure, and can therefore be freely - * converted back into a structure offset. + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. */ -#define PARAVIRT_CALL "call *(paravirt_ops+%c[paravirt_typenum]*4);" +#define PARAVIRT_CALL "call *%[paravirt_opptr];" /* - * These macros are intended to wrap calls into a paravirt_ops - * operation, so that they can be later identified and patched at + * These macros are intended to wrap calls through one of the paravirt + * ops structs, so that they can be later identified and patched at * runtime. * * Normally, a call to a pv_op function is a simple indirect call: @@ -301,7 +338,7 @@ int paravirt_disable_iospace(void); * The call instruction itself is marked by placing its start address * and size into the .parainstructions section, so that * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend paravirt_ops + * appropriate patching under the control of the backend pv_init_ops * implementation. * * Unfortunately there's no way to get gcc to generate the args setup @@ -409,36 +446,36 @@ int paravirt_disable_iospace(void); static inline int paravirt_enabled(void) { - return paravirt_ops.paravirt_enabled; + return pv_info.paravirt_enabled; } static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) { - PVOP_VCALL2(load_esp0, tss, thread); + PVOP_VCALL2(pv_cpu_ops.load_esp0, tss, thread); } -#define ARCH_SETUP paravirt_ops.arch_setup(); +#define ARCH_SETUP pv_init_ops.arch_setup(); static inline unsigned long get_wallclock(void) { - return PVOP_CALL0(unsigned long, get_wallclock); + return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock); } static inline int set_wallclock(unsigned long nowtime) { - return PVOP_CALL1(int, set_wallclock, nowtime); + return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime); } static inline void (*choose_time_init(void))(void) { - return paravirt_ops.time_init; + return pv_time_ops.time_init; } /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { - PVOP_VCALL4(cpuid, eax, ebx, ecx, edx); + PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx); } /* @@ -446,87 +483,87 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, */ static inline unsigned long paravirt_get_debugreg(int reg) { - return PVOP_CALL1(unsigned long, get_debugreg, reg); + return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static inline void set_debugreg(unsigned long val, int reg) { - PVOP_VCALL2(set_debugreg, reg, val); + PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val); } static inline void clts(void) { - PVOP_VCALL0(clts); + PVOP_VCALL0(pv_cpu_ops.clts); } static inline unsigned long read_cr0(void) { - return PVOP_CALL0(unsigned long, read_cr0); + return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0); } static inline void write_cr0(unsigned long x) { - PVOP_VCALL1(write_cr0, x); + PVOP_VCALL1(pv_cpu_ops.write_cr0, x); } static inline unsigned long read_cr2(void) { - return PVOP_CALL0(unsigned long, read_cr2); + return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2); } static inline void write_cr2(unsigned long x) { - PVOP_VCALL1(write_cr2, x); + PVOP_VCALL1(pv_mmu_ops.write_cr2, x); } static inline unsigned long read_cr3(void) { - return PVOP_CALL0(unsigned long, read_cr3); + return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); } static inline void write_cr3(unsigned long x) { - PVOP_VCALL1(write_cr3, x); + PVOP_VCALL1(pv_mmu_ops.write_cr3, x); } static inline unsigned long read_cr4(void) { - return PVOP_CALL0(unsigned long, read_cr4); + return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); } static inline unsigned long read_cr4_safe(void) { - return PVOP_CALL0(unsigned long, read_cr4_safe); + return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe); } static inline void write_cr4(unsigned long x) { - PVOP_VCALL1(write_cr4, x); + PVOP_VCALL1(pv_cpu_ops.write_cr4, x); } static inline void raw_safe_halt(void) { - PVOP_VCALL0(safe_halt); + PVOP_VCALL0(pv_irq_ops.safe_halt); } static inline void halt(void) { - PVOP_VCALL0(safe_halt); + PVOP_VCALL0(pv_irq_ops.safe_halt); } static inline void wbinvd(void) { - PVOP_VCALL0(wbinvd); + PVOP_VCALL0(pv_cpu_ops.wbinvd); } -#define get_kernel_rpl() (paravirt_ops.kernel_rpl) +#define get_kernel_rpl() (pv_info.kernel_rpl) static inline u64 paravirt_read_msr(unsigned msr, int *err) { - return PVOP_CALL2(u64, read_msr, msr, err); + return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); } static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { - return PVOP_CALL3(int, write_msr, msr, low, high); + return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); } /* These should all do BUG_ON(_err), but our headers are too tangled. */ @@ -560,7 +597,7 @@ static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) static inline u64 paravirt_read_tsc(void) { - return PVOP_CALL0(u64, read_tsc); + return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); } #define rdtscl(low) do { \ @@ -572,15 +609,15 @@ static inline u64 paravirt_read_tsc(void) static inline unsigned long long paravirt_sched_clock(void) { - return PVOP_CALL0(unsigned long long, sched_clock); + return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -#define calculate_cpu_khz() (paravirt_ops.get_cpu_khz()) +#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz()) #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) static inline unsigned long long paravirt_read_pmc(int counter) { - return PVOP_CALL1(u64, read_pmc, counter); + return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); } #define rdpmc(counter,low,high) do { \ @@ -591,61 +628,61 @@ static inline unsigned long long paravirt_read_pmc(int counter) static inline void load_TR_desc(void) { - PVOP_VCALL0(load_tr_desc); + PVOP_VCALL0(pv_cpu_ops.load_tr_desc); } static inline void load_gdt(const struct Xgt_desc_struct *dtr) { - PVOP_VCALL1(load_gdt, dtr); + PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr); } static inline void load_idt(const struct Xgt_desc_struct *dtr) { - PVOP_VCALL1(load_idt, dtr); + PVOP_VCALL1(pv_cpu_ops.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { - PVOP_VCALL2(set_ldt, addr, entries); + PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); } static inline void store_gdt(struct Xgt_desc_struct *dtr) { - PVOP_VCALL1(store_gdt, dtr); + PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); } static inline void store_idt(struct Xgt_desc_struct *dtr) { - PVOP_VCALL1(store_idt, dtr); + PVOP_VCALL1(pv_cpu_ops.store_idt, dtr); } static inline unsigned long paravirt_store_tr(void) { - return PVOP_CALL0(unsigned long, store_tr); + return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr); } #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { - PVOP_VCALL2(load_tls, t, cpu); + PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu); } static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high) { - PVOP_VCALL4(write_ldt_entry, dt, entry, low, high); + PVOP_VCALL4(pv_cpu_ops.write_ldt_entry, dt, entry, low, high); } static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high) { - PVOP_VCALL4(write_gdt_entry, dt, entry, low, high); + PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, low, high); } static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high) { - PVOP_VCALL4(write_idt_entry, dt, entry, low, high); + PVOP_VCALL4(pv_cpu_ops.write_idt_entry, dt, entry, low, high); } static inline void set_iopl_mask(unsigned mask) { - PVOP_VCALL1(set_iopl_mask, mask); + PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask); } /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { - paravirt_ops.io_delay(); + pv_cpu_ops.io_delay(); #ifdef REALLY_SLOW_IO - paravirt_ops.io_delay(); - paravirt_ops.io_delay(); - paravirt_ops.io_delay(); + pv_cpu_ops.io_delay(); + pv_cpu_ops.io_delay(); + pv_cpu_ops.io_delay(); #endif } @@ -655,121 +692,120 @@ static inline void slow_down_io(void) { */ static inline void apic_write(unsigned long reg, unsigned long v) { - PVOP_VCALL2(apic_write, reg, v); + PVOP_VCALL2(pv_apic_ops.apic_write, reg, v); } static inline void apic_write_atomic(unsigned long reg, unsigned long v) { - PVOP_VCALL2(apic_write_atomic, reg, v); + PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v); } static inline unsigned long apic_read(unsigned long reg) { - return PVOP_CALL1(unsigned long, apic_read, reg); + return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg); } static inline void setup_boot_clock(void) { - PVOP_VCALL0(setup_boot_clock); + PVOP_VCALL0(pv_apic_ops.setup_boot_clock); } static inline void setup_secondary_clock(void) { - PVOP_VCALL0(setup_secondary_clock); + PVOP_VCALL0(pv_apic_ops.setup_secondary_clock); } #endif static inline void paravirt_post_allocator_init(void) { - if (paravirt_ops.post_allocator_init) - (*paravirt_ops.post_allocator_init)(); + if (pv_init_ops.post_allocator_init) + (*pv_init_ops.post_allocator_init)(); } static inline void paravirt_pagetable_setup_start(pgd_t *base) { - if (paravirt_ops.pagetable_setup_start) - (*paravirt_ops.pagetable_setup_start)(base); + (*pv_mmu_ops.pagetable_setup_start)(base); } static inline void paravirt_pagetable_setup_done(pgd_t *base) { - if (paravirt_ops.pagetable_setup_done) - (*paravirt_ops.pagetable_setup_done)(base); + (*pv_mmu_ops.pagetable_setup_done)(base); } #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) { - PVOP_VCALL3(startup_ipi_hook, phys_apicid, start_eip, start_esp); + PVOP_VCALL3(pv_apic_ops.startup_ipi_hook, + phys_apicid, start_eip, start_esp); } #endif static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { - PVOP_VCALL2(activate_mm, prev, next); + PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); } static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { - PVOP_VCALL2(dup_mmap, oldmm, mm); + PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) { - PVOP_VCALL1(exit_mmap, mm); + PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); } static inline void __flush_tlb(void) { - PVOP_VCALL0(flush_tlb_user); + PVOP_VCALL0(pv_mmu_ops.flush_tlb_user); } static inline void __flush_tlb_global(void) { - PVOP_VCALL0(flush_tlb_kernel); + PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel); } static inline void __flush_tlb_single(unsigned long addr) { - PVOP_VCALL1(flush_tlb_single, addr); + PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr); } static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { - PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va); + PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va); } static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn) { - PVOP_VCALL2(alloc_pt, mm, pfn); + PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn); } static inline void paravirt_release_pt(unsigned pfn) { - PVOP_VCALL1(release_pt, pfn); + PVOP_VCALL1(pv_mmu_ops.release_pt, pfn); } static inline void paravirt_alloc_pd(unsigned pfn) { - PVOP_VCALL1(alloc_pd, pfn); + PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn); } static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn, unsigned start, unsigned count) { - PVOP_VCALL4(alloc_pd_clone, pfn, clonepfn, start, count); + PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count); } static inline void paravirt_release_pd(unsigned pfn) { - PVOP_VCALL1(release_pd, pfn); + PVOP_VCALL1(pv_mmu_ops.release_pd, pfn); } #ifdef CONFIG_HIGHPTE static inline void *kmap_atomic_pte(struct page *page, enum km_type type) { unsigned long ret; - ret = PVOP_CALL2(unsigned long, kmap_atomic_pte, page, type); + ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); return (void *)ret; } #endif @@ -777,162 +813,191 @@ static inline void *kmap_atomic_pte(struct page *page, enum km_type type) static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - PVOP_VCALL3(pte_update, mm, addr, ptep); + PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); } static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - PVOP_VCALL3(pte_update_defer, mm, addr, ptep); + PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); } #ifdef CONFIG_X86_PAE static inline pte_t __pte(unsigned long long val) { - unsigned long long ret = PVOP_CALL2(unsigned long long, make_pte, + unsigned long long ret = PVOP_CALL2(unsigned long long, + pv_mmu_ops.make_pte, val, val >> 32); return (pte_t) { ret, ret >> 32 }; } static inline pmd_t __pmd(unsigned long long val) { - return (pmd_t) { PVOP_CALL2(unsigned long long, make_pmd, val, val >> 32) }; + return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd, + val, val >> 32) }; } static inline pgd_t __pgd(unsigned long long val) { - return (pgd_t) { PVOP_CALL2(unsigned long long, make_pgd, val, val >> 32) }; + return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd, + val, val >> 32) }; } static inline unsigned long long pte_val(pte_t x) { - return PVOP_CALL2(unsigned long long, pte_val, x.pte_low, x.pte_high); + return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val, + x.pte_low, x.pte_high); } static inline unsigned long long pmd_val(pmd_t x) { - return PVOP_CALL2(unsigned long long, pmd_val, x.pmd, x.pmd >> 32); + return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val, + x.pmd, x.pmd >> 32); } static inline unsigned long long pgd_val(pgd_t x) { - return PVOP_CALL2(unsigned long long, pgd_val, x.pgd, x.pgd >> 32); + return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val, + x.pgd, x.pgd >> 32); } static inline void set_pte(pte_t *ptep, pte_t pteval) { - PVOP_VCALL3(set_pte, ptep, pteval.pte_low, pteval.pte_high); + PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high); } static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { /* 5 arg words */ - paravirt_ops.set_pte_at(mm, addr, ptep, pteval); + pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval); } static inline void set_pte_atomic(pte_t *ptep, pte_t pteval) { - PVOP_VCALL3(set_pte_atomic, ptep, pteval.pte_low, pteval.pte_high); + PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep, + pteval.pte_low, pteval.pte_high); } static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* 5 arg words */ - paravirt_ops.set_pte_present(mm, addr, ptep, pte); + pv_mmu_ops.set_pte_present(mm, addr, ptep, pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) { - PVOP_VCALL3(set_pmd, pmdp, pmdval.pmd, pmdval.pmd >> 32); + PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, + pmdval.pmd, pmdval.pmd >> 32); } static inline void set_pud(pud_t *pudp, pud_t pudval) { - PVOP_VCALL3(set_pud, pudp, pudval.pgd.pgd, pudval.pgd.pgd >> 32); + PVOP_VCALL3(pv_mmu_ops.set_pud, pudp, + pudval.pgd.pgd, pudval.pgd.pgd >> 32); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - PVOP_VCALL3(pte_clear, mm, addr, ptep); + PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep); } static inline void pmd_clear(pmd_t *pmdp) { - PVOP_VCALL1(pmd_clear, pmdp); + PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp); } #else /* !CONFIG_X86_PAE */ static inline pte_t __pte(unsigned long val) { - return (pte_t) { PVOP_CALL1(unsigned long, make_pte, val) }; + return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) }; } static inline pgd_t __pgd(unsigned long val) { - return (pgd_t) { PVOP_CALL1(unsigned long, make_pgd, val) }; + return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) }; } static inline unsigned long pte_val(pte_t x) { - return PVOP_CALL1(unsigned long, pte_val, x.pte_low); + return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low); } static inline unsigned long pgd_val(pgd_t x) { - return PVOP_CALL1(unsigned long, pgd_val, x.pgd); + return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd); } static inline void set_pte(pte_t *ptep, pte_t pteval) { - PVOP_VCALL2(set_pte, ptep, pteval.pte_low); + PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low); } static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - PVOP_VCALL4(set_pte_at, mm, addr, ptep, pteval.pte_low); + PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) { - PVOP_VCALL2(set_pmd, pmdp, pmdval.pud.pgd.pgd); + PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd); } #endif /* CONFIG_X86_PAE */ +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void); +void paravirt_enter_lazy_cpu(void); +void paravirt_leave_lazy_cpu(void); +void paravirt_enter_lazy_mmu(void); +void paravirt_leave_lazy_mmu(void); +void paravirt_leave_lazy(enum paravirt_lazy_mode mode); + #define __HAVE_ARCH_ENTER_LAZY_CPU_MODE static inline void arch_enter_lazy_cpu_mode(void) { - PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_CPU); + PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); } static inline void arch_leave_lazy_cpu_mode(void) { - PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_NONE); + PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); } static inline void arch_flush_lazy_cpu_mode(void) { - PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_FLUSH); + if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) { + arch_leave_lazy_cpu_mode(); + arch_enter_lazy_cpu_mode(); + } } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { - PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_MMU); + PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { - PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_NONE); + PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { - PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_FLUSH); + if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } } void _paravirt_nop(void); @@ -957,7 +1022,7 @@ static inline unsigned long __raw_local_save_flags(void) PARAVIRT_CALL "popl %%edx; popl %%ecx") : "=a"(f) - : paravirt_type(save_fl), + : paravirt_type(pv_irq_ops.save_fl), paravirt_clobber(CLBR_EAX) : "memory", "cc"); return f; @@ -970,7 +1035,7 @@ static inline void raw_local_irq_restore(unsigned long f) "popl %%edx; popl %%ecx") : "=a"(f) : "0"(f), - paravirt_type(restore_fl), + paravirt_type(pv_irq_ops.restore_fl), paravirt_clobber(CLBR_EAX) : "memory", "cc"); } @@ -981,7 +1046,7 @@ static inline void raw_local_irq_disable(void) PARAVIRT_CALL "popl %%edx; popl %%ecx") : - : paravirt_type(irq_disable), + : paravirt_type(pv_irq_ops.irq_disable), paravirt_clobber(CLBR_EAX) : "memory", "eax", "cc"); } @@ -992,7 +1057,7 @@ static inline void raw_local_irq_enable(void) PARAVIRT_CALL "popl %%edx; popl %%ecx") : - : paravirt_type(irq_enable), + : paravirt_type(pv_irq_ops.irq_enable), paravirt_clobber(CLBR_EAX) : "memory", "eax", "cc"); } @@ -1008,21 +1073,23 @@ static inline unsigned long __raw_local_irq_save(void) #define CLI_STRING \ _paravirt_alt("pushl %%ecx; pushl %%edx;" \ - "call *paravirt_ops+%c[paravirt_cli_type]*4;" \ + "call *%[paravirt_cli_opptr];" \ "popl %%edx; popl %%ecx", \ "%c[paravirt_cli_type]", "%c[paravirt_clobber]") #define STI_STRING \ _paravirt_alt("pushl %%ecx; pushl %%edx;" \ - "call *paravirt_ops+%c[paravirt_sti_type]*4;" \ + "call *%[paravirt_sti_opptr];" \ "popl %%edx; popl %%ecx", \ "%c[paravirt_sti_type]", "%c[paravirt_clobber]") #define CLI_STI_CLOBBERS , "%eax" #define CLI_STI_INPUT_ARGS \ , \ - [paravirt_cli_type] "i" (PARAVIRT_PATCH(irq_disable)), \ - [paravirt_sti_type] "i" (PARAVIRT_PATCH(irq_enable)), \ + [paravirt_cli_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_disable)), \ + [paravirt_cli_opptr] "m" (pv_irq_ops.irq_disable), \ + [paravirt_sti_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_enable)), \ + [paravirt_sti_opptr] "m" (pv_irq_ops.irq_enable), \ paravirt_clobber(CLBR_EAX) /* Make sure as little as possible of this mess escapes. */ @@ -1042,7 +1109,7 @@ static inline unsigned long __raw_local_irq_save(void) #else /* __ASSEMBLY__ */ -#define PARA_PATCH(off) ((off) / 4) +#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) #define PARA_SITE(ptype, clobbers, ops) \ 771:; \ @@ -1055,29 +1122,29 @@ static inline unsigned long __raw_local_irq_save(void) .short clobbers; \ .popsection -#define INTERRUPT_RETURN \ - PARA_SITE(PARA_PATCH(PARAVIRT_iret), CLBR_NONE, \ - jmp *%cs:paravirt_ops+PARAVIRT_iret) +#define INTERRUPT_RETURN \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ + jmp *%cs:pv_cpu_ops+PV_CPU_iret) #define DISABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(PARAVIRT_irq_disable), clobbers, \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ pushl %eax; pushl %ecx; pushl %edx; \ - call *%cs:paravirt_ops+PARAVIRT_irq_disable; \ + call *%cs:pv_irq_ops+PV_IRQ_irq_disable; \ popl %edx; popl %ecx; popl %eax) \ #define ENABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable), clobbers, \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ pushl %eax; pushl %ecx; pushl %edx; \ - call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ + call *%cs:pv_irq_ops+PV_IRQ_irq_enable; \ popl %edx; popl %ecx; popl %eax) -#define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable_sysexit), CLBR_NONE, \ - jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) +#define ENABLE_INTERRUPTS_SYSEXIT \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), CLBR_NONE,\ + jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_sysexit) #define GET_CR0_INTO_EAX \ push %ecx; push %edx; \ - call *paravirt_ops+PARAVIRT_read_cr0; \ + call *pv_cpu_ops+PV_CPU_read_cr0; \ pop %edx; pop %ecx #endif /* __ASSEMBLY__ */ diff --git a/include/asm-x86/pgtable-3level-defs.h b/include/asm-x86/pgtable-3level-defs.h index c0df89f66e8b..448ac9516314 100644 --- a/include/asm-x86/pgtable-3level-defs.h +++ b/include/asm-x86/pgtable-3level-defs.h @@ -2,7 +2,7 @@ #define _I386_PGTABLE_3LEVEL_DEFS_H #ifdef CONFIG_PARAVIRT -#define SHARED_KERNEL_PMD (paravirt_ops.shared_kernel_pmd) +#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) #else #define SHARED_KERNEL_PMD 1 #endif diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h index ff61ea365997..b05d8a6d9143 100644 --- a/include/xen/interface/vcpu.h +++ b/include/xen/interface/vcpu.h @@ -160,8 +160,9 @@ struct vcpu_set_singleshot_timer { */ #define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */ struct vcpu_register_vcpu_info { - uint32_t mfn; /* mfn of page to place vcpu_info */ - uint32_t offset; /* offset within page */ + uint64_t mfn; /* mfn of page to place vcpu_info */ + uint32_t offset; /* offset within page */ + uint32_t rsvd; /* unused */ }; #endif /* __XEN_PUBLIC_VCPU_H__ */ diff --git a/mm/Kconfig b/mm/Kconfig index 1cc6cada2bbf..b1f03b0eb7f1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -155,7 +155,6 @@ config SPLIT_PTLOCK_CPUS int default "4096" if ARM && !CPU_CACHE_VIPT default "4096" if PARISC && !PA20 - default "4096" if XEN default "4" # |