diff options
42 files changed, 2229 insertions, 363 deletions
diff --git a/Documentation/ABI/testing/sysfs-hypervisor-pmu b/Documentation/ABI/testing/sysfs-hypervisor-pmu new file mode 100644 index 000000000000..224faa105e18 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-hypervisor-pmu @@ -0,0 +1,23 @@ +What: /sys/hypervisor/pmu/pmu_mode +Date: August 2015 +KernelVersion: 4.3 +Contact: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Description: + Describes mode that Xen's performance-monitoring unit (PMU) + uses. Accepted values are + "off" -- PMU is disabled + "self" -- The guest can profile itself + "hv" -- The guest can profile itself and, if it is + privileged (e.g. dom0), the hypervisor + "all" -- The guest can profile itself, the hypervisor + and all other guests. Only available to + privileged guests. + +What: /sys/hypervisor/pmu/pmu_features +Date: August 2015 +KernelVersion: 4.3 +Contact: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Description: + Describes Xen PMU features (as an integer). A set bit indicates + that the corresponding feature is enabled. See + include/xen/interface/xenpmu.h for available features diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f0c950598582..22a4b687ea5b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -4106,6 +4106,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. plus one apbt timer for broadcast timer. x86_intel_mid_timer=apbt_only | lapic_and_apbt + xen_512gb_limit [KNL,X86-64,XEN] + Restricts the kernel running paravirtualized under Xen + to use only up to 512 GB of RAM. The reason to do so is + crash analysis tools and Xen tools for doing domain + save/restore/migration must be enabled to handle larger + domains. + xen_emul_unplug= [HW,X86,XEN] Unplug Xen emulated devices Format: [unplug0,][unplug1] diff --git a/arch/arm/include/asm/xen/events.h b/arch/arm/include/asm/xen/events.h index 8b1f37bfeeec..71e473d05fcc 100644 --- a/arch/arm/include/asm/xen/events.h +++ b/arch/arm/include/asm/xen/events.h @@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) atomic64_t, \ counter), (val)) +/* Rebind event channel is supported by default */ +static inline bool xen_support_evtchn_rebind(void) +{ + return true; +} + #endif /* _ASM_ARM_XEN_EVENTS_H */ diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 1bee8ca12494..98b1084f8282 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -54,26 +54,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) #define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn) -static inline xmaddr_t phys_to_machine(xpaddr_t phys) -{ - unsigned offset = phys.paddr & ~PAGE_MASK; - return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); -} - -static inline xpaddr_t machine_to_phys(xmaddr_t machine) -{ - unsigned offset = machine.maddr & ~PAGE_MASK; - return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); -} /* VIRT <-> MACHINE conversion */ -#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) +/* Only used in PV code. But ARM guests are always HVM. */ static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) { - /* TODO: assuming it is mapped in the kernel 1:1 */ - return virt_to_machine(vaddr); + BUG(); } /* TODO: this shouldn't be here but it is because the frontend drivers diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 6c09cc440a2b..c50c8d33f874 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -45,13 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info; unsigned long xen_released_pages; struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; -/* TODO: to be removed */ -__read_mostly int xen_have_vector_callback; -EXPORT_SYMBOL_GPL(xen_have_vector_callback); - -int xen_platform_pci_unplug = XEN_UNPLUG_ALL; -EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); - static __read_mostly unsigned int xen_events_irq; static __initdata struct device_node *xen_node; diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h index 86553213c132..4318866d053c 100644 --- a/arch/arm64/include/asm/xen/events.h +++ b/arch/arm64/include/asm/xen/events.h @@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +/* Rebind event channel is supported by default */ +static inline bool xen_support_evtchn_rebind(void) +{ + return true; +} + #endif /* _ASM_ARM64_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a466..e6911caf5bbf 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ca08a27b90b3..83aea8055119 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( return _hypercall1(int, tmem_op, op); } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ + return _hypercall2(int, xenpmu_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3400dbaec3c3..62ca03ef5c65 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@ * * Guest OS interface to x86 Xen. * - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser */ #ifndef _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */ #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info { DEFINE_GUEST_HANDLE_STRUCT(trap_info); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it + * has been set to invalid e.g. due to the p2m being too large for the + * 3-level p2m tree. In this case the linear mapper p2m list anchored + * at p2m_vaddr is to be used. + */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + /* + * Following three fields are valid if p2m_cr3 contains a value + * different from 0. + * p2m_cr3 is the root of the address space where p2m_vaddr is valid. + * p2m_cr3 is in the same format as a cr3 value in the vcpu register + * state and holds the folded machine frame number (via xen_pfn_to_cr3) + * of a L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All + * entries in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 + * and a value with the least significant bit set indicates that a + * mapping update is in progress. This allows guest external software + * (e.g. in Dom0) to verify that read mappings are consistent and + * whether they have changed since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an + * atomic write only. + */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ }; #endif /* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info { /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST (1<<1) -#define VGCF_IN_KERNEL (1<<2) +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ @@ -172,6 +250,129 @@ struct vcpu_guest_context { #endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. + */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. + */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. + */ + struct xen_pmu_regs regs; + /* + * Padding for adding new registers to xen_pmu_regs in + * the future + */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). + * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include + * MSR banks that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; + #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c44a5d53e464..a3804fbe1f36 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -35,9 +35,7 @@ typedef struct xpaddr { #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) -/* Maximum amount of memory we can handle in a domain in pages */ -#define MAX_DOMAIN_PAGES \ - ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; @@ -48,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern unsigned long set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 484145368a24..c7b15f3e2cf3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help @@ -23,14 +24,18 @@ config XEN_PVHVM def_bool y depends on XEN && PCI && X86_LOCAL_APIC -config XEN_MAX_DOMAIN_MEMORY - int - default 500 if X86_64 - default 64 if X86_32 - depends on XEN - help - This only affects the sizing of some bss arrays, the unused - portions of which are freed. +config XEN_512GB + bool "Limit Xen pv-domain memory to 512GB" + depends on XEN && X86_64 + default y + help + Limit paravirtualized user domains to 512GB of RAM. + + The Xen tools and crash dump analysis tools might not support + pv-domains with more than 512 GB of RAM. This option controls the + default setting of the kernel to use only up to 512 GB or more. + It is always possible to change the default via specifying the + boot parameter "xen_512gb_limit". config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 4b6e29ac0968..e47e52787d32 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o + p2m.o apic.o pmu.o obj-$(CONFIG_EVENT_TRACING) += trace.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 70e060ad879a..acda713ab5be 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -7,6 +7,7 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> #include "xen-ops.h" +#include "pmu.h" #include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) @@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { + if (reg == APIC_LVTPC) { + (void)pmu_apic_update(reg); + return; + } + /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d9cfa452da9d..30d12afe52ed 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -84,6 +84,7 @@ #include "mmu.h" #include "smp.h" #include "multicalls.h" +#include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0) static void xen_write_cr4(unsigned long cr4) { - cr4 &= ~X86_CR4_PGE; - cr4 &= ~X86_CR4_PSE; + cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); native_write_cr4(cr4); } @@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) { u64 val; + if (pmu_msr_read(msr, &val, err)) + return val; + val = native_read_msr_safe(msr, err); switch (msr) { case MSR_IA32_APICBASE: @@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. */ default: - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); } return ret; @@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, - .read_pmc = native_read_pmc, + .read_pmc = xen_read_pmc, .iret = xen_iret, #ifdef CONFIG_X86_64 @@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = { static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); @@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, + xen_start_info->nr_pages); + xen_reserve_special_pages(); /* * Modify the cache mode translation tables to match Xen's PAT diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dd151b2045b0..2c50b445884e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static phys_addr_t xen_pt_base, xen_pt_size __initdata; /* * Just beyond the highest usermode address. STACK_TOP_MAX has a @@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr, xen_mc_flush(); } +/* + * Make a page range writeable and free it. + */ +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) +{ + void *vaddr = __va(paddr); + void *vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) + make_lowmem_page_readwrite(vaddr); + + memblock_free(paddr, size); +} + +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) +{ + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + + if (unpin) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); + ClearPagePinned(virt_to_page(__va(pa))); + xen_free_ro_pages(pa, PAGE_SIZE); +} + +/* + * Since it is well isolated we can (and since it is perhaps large we should) + * also free the page tables mapping the initial P->M table. + */ +static void __init xen_cleanmfnmap(unsigned long vaddr) +{ + unsigned long va = vaddr & PMD_MASK; + unsigned long pa; + pgd_t *pgd = pgd_offset_k(va); + pud_t *pud_page = pud_offset(pgd, 0); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + set_pgd(pgd, __pgd(0)); + do { + pud = pud_page + pud_index(va); + if (pud_none(*pud)) { + va += PUD_SIZE; + } else if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + va += PUD_SIZE; + } else { + pmd = pmd_offset(pud, va); + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + } else if (!pmd_none(*pmd)) { + pte = pte_offset_kernel(pmd, va); + set_pmd(pmd, __pmd(0)); + for (i = 0; i < PTRS_PER_PTE; ++i) { + if (pte_none(pte[i])) + break; + pa = pte_pfn(pte[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + xen_cleanmfnmap_free_pgtbl(pte, unpin); + } + va += PMD_SIZE; + if (pmd_index(va)) + continue; + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd, unpin); + } + + } while (pud_index(va) || pmd_index(va)); + xen_cleanmfnmap_free_pgtbl(pud_page, unpin); +} + static void __init xen_pagetable_p2m_free(void) { unsigned long size; @@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void) /* using __ka address and sticking INVALID_P2M_ENTRY! */ memset((void *)xen_start_info->mfn_list, 0xff, size); - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + /* + * We could be in __ka space. + * We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or + * xen_start_info->shared_info they are in going to crash. Fortunatly + * we have already revectored in xen_setup_kernel_pagetable and in + * xen_setup_shared_info. + */ size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); + if (addr >= __START_KERNEL_map) { + xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + memblock_free(__pa(addr), size); + } else { + xen_cleanmfnmap(addr); + } +} + +static void __init xen_pagetable_cleanhighmap(void) +{ + unsigned long size; + unsigned long addr; /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void) #ifdef CONFIG_X86_64 xen_pagetable_p2m_free(); + + xen_pagetable_cleanhighmap(); #endif /* And revector! Bye bye old array */ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; @@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { + unsigned long pfn; + + if (xen_feature(XENFEAT_writable_page_tables) || + xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->mfn_list >= __START_KERNEL_map) + return pte; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. + */ + pfn = pte_pfn(pte); + if (pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW); + return pte; } #endif /* CONFIG_X86_64 */ @@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } -static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) @@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * mappings. Considering that on Xen after the kernel mappings we * have the mappings of some pages that don't exist in pfn space, we * set max_pfn_mapped to the last real pfn mapped. */ - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + if (xen_start_info->mfn_list < __START_KERNEL_map) + max_pfn_mapped = xen_start_info->first_p2m_pfn; + else + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); pt_end = pt_base + xen_start_info->nr_pt_frames; @@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* Copy the initial P->M table mappings if necessary. */ + i = pgd_index(xen_start_info->mfn_list); + if (i && i < pgd_index(__START_KERNEL_map)) + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); @@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) check_pt_base(&pt_base, &pt_end, addr[i]); /* Our (by three pages) smaller Xen pagetable that we are using */ - memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + xen_pt_base = PFN_PHYS(pt_base); + xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; + memblock_reserve(xen_pt_base, xen_pt_size); + /* Revector the xen_start_info */ xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } + +/* + * Read a value from a physical address. + */ +static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) +{ + unsigned long *vaddr; + unsigned long val; + + vaddr = early_memremap_ro(addr, sizeof(val)); + val = *vaddr; + early_memunmap(vaddr, sizeof(val)); + return val; +} + +/* + * Translate a virtual address to a physical one without relying on mapped + * page tables. + */ +static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) +{ + phys_addr_t pa; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + pte_t pte; + + pa = read_cr3(); + pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * + sizeof(pgd))); + if (!pgd_present(pgd)) + return 0; + + pa = pgd_val(pgd) & PTE_PFN_MASK; + pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * + sizeof(pud))); + if (!pud_present(pud)) + return 0; + pa = pud_pfn(pud) << PAGE_SHIFT; + if (pud_large(pud)) + return pa + (vaddr & ~PUD_MASK); + + pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * + sizeof(pmd))); + if (!pmd_present(pmd)) + return 0; + pa = pmd_pfn(pmd) << PAGE_SHIFT; + if (pmd_large(pmd)) + return pa + (vaddr & ~PMD_MASK); + + pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * + sizeof(pte))); + if (!pte_present(pte)) + return 0; + pa = pte_pfn(pte) << PAGE_SHIFT; + + return pa | (vaddr & ~PAGE_MASK); +} + +/* + * Find a new area for the hypervisor supplied p2m list and relocate the p2m to + * this area. + */ +void __init xen_relocate_p2m(void) +{ + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; + int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + unsigned long *new_p2m; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + n_frames = n_pte + n_pt + n_pmd + n_pud; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); + BUG(); + } + + /* + * Setup the page tables for addressing the new p2m list. + * We have asked the hypervisor to map the p2m list at the user address + * PUD_SIZE. It may have done so, or it may have used a kernel space + * address depending on the Xen version. + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. + */ + pud_phys = new_area; + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; + } + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; + } + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } + + /* Now copy the old p2m info to the new area. */ + memcpy(new_p2m, xen_p2m_addr, size); + xen_p2m_addr = new_p2m; + + /* Release the old p2m list and set new list info. */ + p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); + BUG_ON(!p2m_pfn); + p2m_pfn_end = p2m_pfn + PFN_DOWN(size); + + if (xen_start_info->mfn_list < __START_KERNEL_map) { + pfn = xen_start_info->first_p2m_pfn; + pfn_end = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + set_pgd(pgd + 1, __pgd(0)); + } else { + pfn = p2m_pfn; + pfn_end = p2m_pfn_end; + } + + memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + while (pfn < pfn_end) { + if (pfn == p2m_pfn) { + pfn = p2m_pfn_end; + continue; + } + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + pfn++; + } + + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; + xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); + xen_start_info->nr_p2m_frames = n_frames; +} + #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); @@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } +/* + * For 32 bit domains xen_start_info->pt_base is the pgd address which might be + * not the first page table in the page table pool. + * Iterate through the initial page tables to find the real page table base. + */ +static phys_addr_t xen_find_pt_base(pmd_t *pmd) +{ + phys_addr_t pt_base, paddr; + unsigned pmdidx; + + pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) + if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { + paddr = m2p(pmd[pmdidx].pmd); + pt_base = min(pt_base, paddr); + } + + return pt_base; +} + void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + + xen_pt_base = xen_find_pt_base(kernel_pmd); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + - xen_start_info->nr_pt_frames * PAGE_SIZE + - 512*1024); + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); @@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + memblock_reserve(xen_pt_base, xen_pt_size); } #endif /* CONFIG_X86_64 */ +void __init xen_reserve_special_pages(void) +{ + phys_addr_t paddr; + + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + if (xen_start_info->store_mfn) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } + if (!xen_initial_domain()) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } +} + +void __init xen_pt_check_e820(void) +{ + if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { + xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); + BUG(); + } +} + static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8b7f18e200aa..bfc08b13044b 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -79,10 +79,14 @@ #include <xen/balloon.h> #include <xen/grant_table.h> -#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) unsigned long *xen_p2m_addr __read_mostly; @@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void) unsigned int level, topidx, mididx; unsigned long *mid_mfn_p; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) return; /* Pre-initialize p2m_top_mfn to be completely missing */ @@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); + if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL; + else + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; + HYPERVISOR_shared_info->arch.p2m_generation = 0; + HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr; + HYPERVISOR_shared_info->arch.p2m_cr3 = + xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) ptechk = lookup_address(vaddr, &level); if (ptechk == pte_pg) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pmd(pmdp, __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; pte_newpg[i] = NULL; } @@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) */ static bool alloc_p2m(unsigned long pfn) { - unsigned topidx, mididx; + unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; pte_t *ptep, *pte_pg; unsigned int level; @@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn) return false; } - if (p2m_top_mfn) { + if (p2m_top_mfn && pfn < MAX_P2M_PFN) { + topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); @@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn) spin_lock_irqsave(&p2m_update_lock, flags); if (pte_pfn(*ptep) == p2m_pfn) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; if (mid_mfn) - mid_mfn[mididx] = virt_to_mfn(p2m); + mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m); p2m = NULL; } @@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + /* + * The interface requires atomic updates on p2m elements. + * xen_safe_write_ulong() is using __put_user which does an atomic + * store via asm(). + */ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) return true; diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h deleted file mode 100644 index ad8aee24ab72..000000000000 --- a/arch/x86/xen/p2m.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _XEN_P2M_H -#define _XEN_P2M_H - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -#define MAX_REMAP_RANGES 10 - -extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); - -#endif /* _XEN_P2M_H */ diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index a8261716d58d..9586ff32810c 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -68,7 +68,7 @@ static int check_platform_magic(void) return 0; } -bool xen_has_pv_devices() +bool xen_has_pv_devices(void) { if (!xen_domain()) return false; diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c new file mode 100644 index 000000000000..724a08740a04 --- /dev/null +++ b/arch/x86/xen/pmu.c @@ -0,0 +1,570 @@ +#include <linux/types.h> +#include <linux/interrupt.h> + +#include <asm/xen/hypercall.h> +#include <xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> + +#include "xen-ops.h" +#include "pmu.h" + +/* x86_pmu.handle_irq definition */ +#include "../kernel/cpu/perf_event.h" + +#define XENPMU_IRQ_PROCESSING 1 +struct xenpmu { + /* Shared page between hypervisor and domain */ + struct xen_pmu_data *xenpmu_data; + + uint8_t flags; +}; +static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared); +#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data) +#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags) + +/* Macro for computing address of a PMU MSR bank */ +#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \ + (uintptr_t)ctxt->field)) + +/* AMD PMU */ +#define F15H_NUM_COUNTERS 6 +#define F10H_NUM_COUNTERS 4 + +static __read_mostly uint32_t amd_counters_base; +static __read_mostly uint32_t amd_ctrls_base; +static __read_mostly int amd_msr_step; +static __read_mostly int k7_counters_mirrored; +static __read_mostly int amd_num_counters; + +/* Intel PMU */ +#define MSR_TYPE_COUNTER 0 +#define MSR_TYPE_CTRL 1 +#define MSR_TYPE_GLOBAL 2 +#define MSR_TYPE_ARCH_COUNTER 3 +#define MSR_TYPE_ARCH_CTRL 4 + +/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */ +#define PMU_GENERAL_NR_SHIFT 8 +#define PMU_GENERAL_NR_BITS 8 +#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \ + << PMU_GENERAL_NR_SHIFT) + +/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */ +#define PMU_FIXED_NR_SHIFT 0 +#define PMU_FIXED_NR_BITS 5 +#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \ + << PMU_FIXED_NR_SHIFT) + +/* Alias registers (0x4c1) for full-width writes to PMCs */ +#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0)) + +#define INTEL_PMC_TYPE_SHIFT 30 + +static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters; + + +static void xen_pmu_arch_init(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + + switch (boot_cpu_data.x86) { + case 0x15: + amd_num_counters = F15H_NUM_COUNTERS; + amd_counters_base = MSR_F15H_PERF_CTR; + amd_ctrls_base = MSR_F15H_PERF_CTL; + amd_msr_step = 2; + k7_counters_mirrored = 1; + break; + case 0x10: + case 0x12: + case 0x14: + case 0x16: + default: + amd_num_counters = F10H_NUM_COUNTERS; + amd_counters_base = MSR_K7_PERFCTR0; + amd_ctrls_base = MSR_K7_EVNTSEL0; + amd_msr_step = 1; + k7_counters_mirrored = 0; + break; + } + } else { + uint32_t eax, ebx, ecx, edx; + + cpuid(0xa, &eax, &ebx, &ecx, &edx); + + intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >> + PMU_GENERAL_NR_SHIFT; + intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >> + PMU_FIXED_NR_SHIFT; + } +} + +static inline uint32_t get_fam15h_addr(u32 addr) +{ + switch (addr) { + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0); + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0); + default: + break; + } + + return addr; +} + +static inline bool is_amd_pmu_msr(unsigned int msr) +{ + if ((msr >= MSR_F15H_PERF_CTL && + msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || + (msr >= MSR_K7_EVNTSEL0 && + msr < MSR_K7_PERFCTR0 + amd_num_counters)) + return true; + + return false; +} + +static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +{ + u32 msr_index_pmc; + + switch (msr_index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_IA32_DS_AREA: + case MSR_IA32_PEBS_ENABLE: + *type = MSR_TYPE_CTRL; + return true; + + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *type = MSR_TYPE_GLOBAL; + return true; + + default: + + if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) && + (msr_index < MSR_CORE_PERF_FIXED_CTR0 + + intel_num_fixed_counters)) { + *index = msr_index - MSR_CORE_PERF_FIXED_CTR0; + *type = MSR_TYPE_COUNTER; + return true; + } + + if ((msr_index >= MSR_P6_EVNTSEL0) && + (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) { + *index = msr_index - MSR_P6_EVNTSEL0; + *type = MSR_TYPE_ARCH_CTRL; + return true; + } + + msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK; + if ((msr_index_pmc >= MSR_IA32_PERFCTR0) && + (msr_index_pmc < MSR_IA32_PERFCTR0 + + intel_num_arch_counters)) { + *type = MSR_TYPE_ARCH_COUNTER; + *index = msr_index_pmc - MSR_IA32_PERFCTR0; + return true; + } + return false; + } +} + +static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, + int index, bool is_read) +{ + uint64_t *reg = NULL; + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fix_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + ctxt = &xenpmu_data->pmu.c.intel; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + reg = &ctxt->global_ovf_ctrl; + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + reg = &ctxt->global_status; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + reg = &ctxt->global_ctrl; + break; + case MSR_CORE_PERF_FIXED_CTR_CTRL: + reg = &ctxt->fixed_ctrl; + break; + default: + switch (type) { + case MSR_TYPE_COUNTER: + fix_counters = field_offset(ctxt, fixed_counters); + reg = &fix_counters[index]; + break; + case MSR_TYPE_ARCH_COUNTER: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].counter; + break; + case MSR_TYPE_ARCH_CTRL: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].control; + break; + default: + return false; + } + } + + if (reg) { + if (is_read) + *val = *reg; + else { + *reg = *val; + + if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL) + ctxt->global_status &= (~(*val)); + } + return true; + } + + return false; +} + +static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) +{ + uint64_t *reg = NULL; + int i, off = 0; + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs, *ctrl_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + if (k7_counters_mirrored && + ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3))) + msr = get_fam15h_addr(msr); + + ctxt = &xenpmu_data->pmu.c.amd; + for (i = 0; i < amd_num_counters; i++) { + if (msr == amd_ctrls_base + off) { + ctrl_regs = field_offset(ctxt, ctrls); + reg = &ctrl_regs[i]; + break; + } else if (msr == amd_counters_base + off) { + counter_regs = field_offset(ctxt, counters); + reg = &counter_regs[i]; + break; + } + off += amd_msr_step; + } + + if (reg) { + if (is_read) + *val = *reg; + else + *reg = *val; + + return true; + } + return false; +} + +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, val, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } + + return false; +} + +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) +{ + uint64_t val = ((uint64_t)high << 32) | low; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, &val, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } + + return false; +} + +static unsigned long long xen_amd_read_pmc(int counter) +{ + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + msr = amd_counters_base + (counter * amd_msr_step); + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.amd; + counter_regs = field_offset(ctxt, counters); + return counter_regs[counter]; +} + +static unsigned long long xen_intel_read_pmc(int counter) +{ + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fixed_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) + msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff); + else + msr = MSR_IA32_PERFCTR0 + counter; + + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.intel; + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) { + fixed_counters = field_offset(ctxt, fixed_counters); + return fixed_counters[counter & 0xffff]; + } + + arch_cntr_pair = field_offset(ctxt, arch_counters); + return arch_cntr_pair[counter].counter; +} + +unsigned long long xen_read_pmc(int counter) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return xen_amd_read_pmc(counter); + else + return xen_intel_read_pmc(counter); +} + +int pmu_apic_update(uint32_t val) +{ + int ret; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return -EINVAL; + } + + xenpmu_data->pmu.l.lapic_lvtpc = val; + + if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING) + return 0; + + ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL); + + return ret; +} + +/* perf callbacks */ +static int xen_is_in_guest(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) + return 0; + + return 1; +} + +static int xen_is_user_mode(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) + return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); + else + return !!(xenpmu_data->pmu.r.regs.cpl & 3); +} + +static unsigned long xen_get_guest_ip(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + return xenpmu_data->pmu.r.regs.ip; +} + +static struct perf_guest_info_callbacks xen_guest_cbs = { + .is_in_guest = xen_is_in_guest, + .is_user_mode = xen_is_user_mode, + .get_guest_ip = xen_get_guest_ip, +}; + +/* Convert registers from Xen's format to Linux' */ +static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, + struct pt_regs *regs, uint64_t pmu_flags) +{ + regs->ip = xen_regs->ip; + regs->cs = xen_regs->cs; + regs->sp = xen_regs->sp; + + if (pmu_flags & PMU_SAMPLE_PV) { + if (pmu_flags & PMU_SAMPLE_USER) + regs->cs |= 3; + else + regs->cs &= ~3; + } else { + if (xen_regs->cpl) + regs->cs |= 3; + else + regs->cs &= ~3; + } +} + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) +{ + int err, ret = IRQ_NONE; + struct pt_regs regs; + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return ret; + } + + this_cpu_ptr(&xenpmu_shared)->flags = + xenpmu_flags | XENPMU_IRQ_PROCESSING; + xen_convert_regs(&xenpmu_data->pmu.r.regs, ®s, + xenpmu_data->pmu.pmu_flags); + if (x86_pmu.handle_irq(®s)) + ret = IRQ_HANDLED; + + /* Write out cached context to HW */ + err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); + this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags; + if (err) { + pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); + return IRQ_NONE; + } + + return ret; +} + +bool is_xen_pmu(int cpu) +{ + return (get_xenpmu_data() != NULL); +} + +void xen_pmu_init(int cpu) +{ + int err; + struct xen_pmu_params xp; + unsigned long pfn; + struct xen_pmu_data *xenpmu_data; + + BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE); + + if (xen_hvm_domain()) + return; + + xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL); + if (!xenpmu_data) { + pr_err("VPMU init: No memory\n"); + return; + } + pfn = virt_to_pfn(xenpmu_data); + + xp.val = pfn_to_mfn(pfn); + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp); + if (err) + goto fail; + + per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data; + per_cpu(xenpmu_shared, cpu).flags = 0; + + if (cpu == 0) { + perf_register_guest_info_callbacks(&xen_guest_cbs); + xen_pmu_arch_init(); + } + + return; + +fail: + pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n", + cpu, err); + free_pages((unsigned long)xenpmu_data, 0); +} + +void xen_pmu_finish(int cpu) +{ + struct xen_pmu_params xp; + + if (xen_hvm_domain()) + return; + + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + + (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp); + + free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0); + per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL; +} diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h new file mode 100644 index 000000000000..af5f0ad94078 --- /dev/null +++ b/arch/x86/xen/pmu.h @@ -0,0 +1,15 @@ +#ifndef __XEN_PMU_H +#define __XEN_PMU_H + +#include <xen/interface/xenpmu.h> + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +void xen_pmu_init(int cpu); +void xen_pmu_finish(int cpu); +bool is_xen_pmu(int cpu); +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); +int pmu_apic_update(uint32_t reg); +unsigned long long xen_read_pmc(int counter); + +#endif /* __XEN_PMU_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 55f388ef481a..f5ef6746d47a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,17 +27,23 @@ #include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/features.h> +#include <xen/hvc-console.h> #include "xen-ops.h" #include "vdso.h" -#include "p2m.h" #include "mmu.h" +#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) + /* Amount of extra memory space we add to the e820 ranges */ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* E820 map used during setting up memory. */ +static struct e820entry xen_e820_map[E820MAX] __initdata; +static u32 xen_e820_map_entries __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; */ #define EXTRA_MEM_RATIO (10) -static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) +static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); + +static void __init xen_parse_512gb(void) +{ + bool val = false; + char *arg; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit"); + if (!arg) + return; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); + if (!arg) + val = true; + else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + return; + + xen_512gb_limit = val; +} + +static void __init xen_add_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { /* Add new region. */ - if (xen_extra_mem[i].size == 0) { - xen_extra_mem[i].start = start; - xen_extra_mem[i].size = size; + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; break; } /* Append to existing region. */ - if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { - xen_extra_mem[i].size += size; + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; break; } } if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_reserve(start, size); + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } -static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_del_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; - phys_addr_t start_r, size_r; + unsigned long start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - start_r = xen_extra_mem[i].start; - size_r = xen_extra_mem[i].size; + start_r = xen_extra_mem[i].start_pfn; + size_r = xen_extra_mem[i].n_pfns; /* Start of region. */ - if (start_r == start) { - BUG_ON(size > size_r); - xen_extra_mem[i].start += size; - xen_extra_mem[i].size -= size; + if (start_r == start_pfn) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].start_pfn += n_pfns; + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* End of region. */ - if (start_r + size_r == start + size) { - BUG_ON(size > size_r); - xen_extra_mem[i].size -= size; + if (start_r + size_r == start_pfn + n_pfns) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* Mid of region. */ - if (start > start_r && start < start_r + size_r) { - BUG_ON(start + size > start_r + size_r); - xen_extra_mem[i].size = start - start_r; + if (start_pfn > start_r && start_pfn < start_r + size_r) { + BUG_ON(start_pfn + n_pfns > start_r + size_r); + xen_extra_mem[i].n_pfns = start_pfn - start_r; /* Calling memblock_reserve() again is okay. */ - xen_add_extra_mem(start + size, start_r + size_r - - (start + size)); + xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r - + (start_pfn + n_pfns)); break; } } - memblock_free(start, size); + memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { int i; - phys_addr_t addr = PFN_PHYS(pfn); for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (addr >= xen_extra_mem[i].start && - addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + if (pfn >= xen_extra_mem[i].start_pfn && + pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns) return INVALID_P2M_ENTRY; } @@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void) int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (!xen_extra_mem[i].size) + if (!xen_extra_mem[i].n_pfns) continue; - pfn_s = PFN_DOWN(xen_extra_mem[i].start); - pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + pfn_s = xen_extra_mem[i].start_pfn; + pfn_e = pfn_s + xen_extra_mem[i].n_pfns; for (pfn = pfn_s; pfn < pfn_e; pfn++) set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void) * This function updates min_pfn with the pfn found and returns * the size of that range or zero if not found. */ -static unsigned long __init xen_find_pfn_range( - const struct e820entry *list, size_t map_size, - unsigned long *min_pfn) +static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) { - const struct e820entry *entry; + const struct e820entry *entry = xen_e820_map; unsigned int i; unsigned long done = 0; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; @@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) + unsigned long end_pfn, unsigned long nr_pages) { unsigned long pfn, end; int ret; @@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { - (*released)++; + xen_released_pages++; if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) break; } else @@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. */ static unsigned long __init xen_set_identity_and_remap_chunk( - const struct e820entry *list, size_t map_size, unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *released, unsigned long *remapped) + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; @@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (cur_pfn + size > nr_pages) size = nr_pages - cur_pfn; - remap_range_size = xen_find_pfn_range(list, map_size, - &remap_pfn); + remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warning("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, released); + cur_pfn + left, nr_pages); break; } /* Adjust size to fit in current e820 RAM region */ @@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk( /* Update variables to reflect new mappings. */ i += size; remap_pfn += size; - *remapped += size; } /* @@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap( - const struct e820entry *list, size_t map_size, unsigned long nr_pages, - unsigned long *released, unsigned long *remapped) +static void __init xen_set_identity_and_remap(unsigned long nr_pages) { phys_addr_t start = 0; unsigned long last_pfn = nr_pages; - const struct e820entry *entry; - unsigned long num_released = 0; - unsigned long num_remapped = 0; + const struct e820entry *entry = xen_e820_map; int i; /* @@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap( * example) the DMI tables in a reserved region that begins on * a non-page boundary. */ - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { + if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap( if (start_pfn < end_pfn) last_pfn = xen_set_identity_and_remap_chunk( - list, map_size, start_pfn, - end_pfn, nr_pages, last_pfn, - &num_released, &num_remapped); + start_pfn, end_pfn, nr_pages, + last_pfn); start = end; } } - *released = num_released; - *remapped = num_remapped; - - pr_info("Released %ld page(s)\n", num_released); + pr_info("Released %ld page(s)\n", xen_released_pages); } /* @@ -494,7 +513,7 @@ void __init xen_remap_memory(void) } else if (pfn_s + len == xen_remap_buf.target_pfn) { len += xen_remap_buf.size; } else { - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } @@ -504,19 +523,36 @@ void __init xen_remap_memory(void) } if (pfn_s != ~0UL && len) - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); } +static unsigned long __init xen_get_pages_limit(void) +{ + unsigned long limit; + +#ifdef CONFIG_X86_32 + limit = GB(64) / PAGE_SIZE; +#else + limit = MAXMEM / PAGE_SIZE; + if (!xen_initial_domain() && xen_512gb_limit) + limit = GB(512) / PAGE_SIZE; +#endif + return limit; +} + static unsigned long __init xen_get_max_pages(void) { - unsigned long max_pages = MAX_DOMAIN_PAGES; + unsigned long max_pages, limit; domid_t domid = DOMID_SELF; int ret; + limit = xen_get_pages_limit(); + max_pages = limit; + /* * For the initial domain we use the maximum reservation as * the maximum page. @@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void) max_pages = ret; } - return min(max_pages, MAX_DOMAIN_PAGES); + return min(max_pages, limit); } static void __init xen_align_and_add_e820_region(phys_addr_t start, @@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, e820_add_region(start, end - start, type); } -static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) +static void __init xen_ignore_unusable(void) { - struct e820entry *entry; + struct e820entry *entry = xen_e820_map; unsigned int i; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { if (entry->type == E820_UNUSABLE) entry->type = E820_RAM; } } +static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) +{ + unsigned long extra = 0; + unsigned long start_pfn, end_pfn; + const struct e820entry *entry = xen_e820_map; + int i; + + end_pfn = 0; + for (i = 0; i < xen_e820_map_entries; i++, entry++) { + start_pfn = PFN_DOWN(entry->addr); + /* Adjacent regions on non-page boundaries handling! */ + end_pfn = min(end_pfn, start_pfn); + + if (start_pfn >= max_pfn) + return extra + max_pfn - end_pfn; + + /* Add any holes in map to result. */ + extra += start_pfn - end_pfn; + + end_pfn = PFN_UP(entry->addr + entry->size); + end_pfn = min(end_pfn, max_pfn); + + if (entry->type != E820_RAM) + extra += end_pfn - start_pfn; + } + + return extra; +} + +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +{ + struct e820entry *entry; + unsigned mapcnt; + phys_addr_t end; + + if (!size) + return false; + + end = start + size; + entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) { + if (entry->type == E820_RAM && entry->addr <= start && + (entry->addr + entry->size) >= end) + return false; + + entry++; + } + + return true; +} + +/* + * Find a free area in physical memory not yet reserved and compliant with + * E820 map. + * Used to relocate pre-allocated areas like initrd or p2m list which are in + * conflict with the to be used E820 map. + * In case no area is found, return 0. Otherwise return the physical address + * of the area which is already reserved for convenience. + */ +phys_addr_t __init xen_find_free_area(phys_addr_t size) +{ + unsigned mapcnt; + phys_addr_t addr, start; + struct e820entry *entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) { + if (entry->type != E820_RAM || entry->size < size) + continue; + start = entry->addr; + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + if (!memblock_is_reserved(addr)) + continue; + start = addr + PAGE_SIZE; + if (start + size > entry->addr + entry->size) + break; + } + if (addr >= start + size) { + memblock_reserve(start, size); + return start; + } + } + + return 0; +} + +/* + * Like memcpy, but with physical addresses for dest and src. + */ +static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, + phys_addr_t n) +{ + phys_addr_t dest_off, src_off, dest_len, src_len, len; + void *from, *to; + + while (n) { + dest_off = dest & ~PAGE_MASK; + src_off = src & ~PAGE_MASK; + dest_len = n; + if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off) + dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off; + src_len = n; + if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off) + src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off; + len = min(dest_len, src_len); + to = early_memremap(dest - dest_off, dest_len + dest_off); + from = early_memremap(src - src_off, src_len + src_off); + memcpy(to, from, len); + early_memunmap(to, dest_len + dest_off); + early_memunmap(from, src_len + src_off); + n -= len; + dest += len; + src += len; + } +} + +/* + * Reserve Xen mfn_list. + */ +static void __init xen_reserve_xen_mfnlist(void) +{ + phys_addr_t start, size; + + if (xen_start_info->mfn_list >= __START_KERNEL_map) { + start = __pa(xen_start_info->mfn_list); + size = PFN_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + } else { + start = PFN_PHYS(xen_start_info->first_p2m_pfn); + size = PFN_PHYS(xen_start_info->nr_p2m_frames); + } + + if (!xen_is_e820_reserved(start, size)) { + memblock_reserve(start, size); + return; + } + +#ifdef CONFIG_X86_32 + /* + * Relocating the p2m on 32 bit system to an arbitrary virtual address + * is not supported, so just give up. + */ + xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n"); + BUG(); +#else + xen_relocate_p2m(); +#endif +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - - unsigned long max_pfn = xen_start_info->nr_pages; - phys_addr_t mem_end; + unsigned long max_pfn, pfn_s, n_pfns; + phys_addr_t mem_end, addr, size, chunk_size; + u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; - unsigned long remapped_pages; int i; int op; - max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + xen_parse_512gb(); + max_pfn = xen_get_pages_limit(); + max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? XENMEM_machine_memory_map : @@ -590,15 +775,16 @@ char * __init xen_memory_setup(void) if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; - map[0].addr = 0ULL; - map[0].size = mem_end; + xen_e820_map[0].addr = 0ULL; + xen_e820_map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ - map[0].size += 8ULL << 20; - map[0].type = E820_RAM; + xen_e820_map[0].size += 8ULL << 20; + xen_e820_map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); + xen_e820_map_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE @@ -609,24 +795,19 @@ char * __init xen_memory_setup(void) * a patch in the future. */ if (xen_initial_domain()) - xen_ignore_unusable(map, memmap.nr_entries); + xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ - sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); + sanitize_e820_map(xen_e820_map, xen_e820_map_entries, + &xen_e820_map_entries); max_pages = xen_get_max_pages(); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; - /* - * Set identity map on non-RAM pages and prepare remapping the - * underlying RAM. - */ - xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages, &remapped_pages); + /* How many extra pages do we need due to remapping? */ + max_pages += xen_count_remap_pages(max_pfn); - extra_pages += xen_released_pages; - extra_pages += remapped_pages; + if (max_pages > max_pfn) + extra_pages += max_pages - max_pfn; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -635,46 +816,54 @@ char * __init xen_memory_setup(void) * is limited to the max size of lowmem, so that it doesn't * get completely filled. * + * Make sure we have no memory above max_pages, as this area + * isn't handled by the p2m management. + * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ - extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages); + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages, max_pages - max_pfn); i = 0; - while (i < memmap.nr_entries) { - phys_addr_t addr = map[i].addr; - phys_addr_t size = map[i].size; - u32 type = map[i].type; + addr = xen_e820_map[0].addr; + size = xen_e820_map[0].size; + while (i < xen_e820_map_entries) { + chunk_size = size; + type = xen_e820_map[i].type; if (type == E820_RAM) { if (addr < mem_end) { - size = min(size, mem_end - addr); + chunk_size = min(size, mem_end - addr); } else if (extra_pages) { - size = min(size, PFN_PHYS(extra_pages)); - extra_pages -= PFN_DOWN(size); - xen_add_extra_mem(addr, size); - xen_max_p2m_pfn = PFN_DOWN(addr + size); + chunk_size = min(size, PFN_PHYS(extra_pages)); + pfn_s = PFN_UP(addr); + n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s; + extra_pages -= n_pfns; + xen_add_extra_mem(pfn_s, n_pfns); + xen_max_p2m_pfn = pfn_s + n_pfns; } else type = E820_UNUSABLE; } - xen_align_and_add_e820_region(addr, size, type); + xen_align_and_add_e820_region(addr, chunk_size, type); - map[i].addr += size; - map[i].size -= size; - if (map[i].size == 0) + addr += chunk_size; + size -= chunk_size; + if (size == 0) { i++; + if (i < xen_e820_map_entries) { + addr = xen_e820_map[i].addr; + size = xen_e820_map[i].size; + } + } } /* * Set the rest as identity mapped, in case PCI BARs are * located here. - * - * PFNs above MAX_P2M_PFN are considered identity mapped as - * well. */ - set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + set_phys_range_identity(addr / PAGE_SIZE, ~0ul); /* * In domU, the ISA region is normal, usable memory, but we @@ -684,34 +873,53 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* - * Reserve Xen bits: - * - mfn_list - * - xen_start_info - * See comment above "struct start_info" in <xen/interface/xen.h> - * We tried to make the the memblock_reserve more selective so - * that it would be clear what region is reserved. Sadly we ran - * in the problem wherein on a 64-bit hypervisor with a 32-bit - * initial domain, the pt_base has the cr3 value which is not - * neccessarily where the pagetable starts! As Jan put it: " - * Actually, the adjustment turns out to be correct: The page - * tables for a 32-on-64 dom0 get allocated in the order "first L1", - * "first L2", "first L3", so the offset to the page table base is - * indeed 2. When reading xen/include/public/xen.h's comment - * very strictly, this is not a violation (since there nothing is said - * that the first thing in the page table space is pointed to by - * pt_base; I admit that this seems to be implied though, namely - * do I think that it is implied that the page table space is the - * range [pt_base, pt_base + nt_pt_frames), whereas that - * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), - * which - without a priori knowledge - the kernel would have - * difficulty to figure out)." - so lets just fall back to the - * easy way and reserve the whole region. + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. */ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + if (xen_is_e820_reserved(__pa_symbol(_text), + __pa_symbol(__bss_stop) - __pa_symbol(_text))) { + xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); + BUG(); + } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + + xen_reserve_xen_mfnlist(); + + /* Check for a conflict of the initrd with the target E820 map. */ + if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image, + boot_params.hdr.ramdisk_size)) { + phys_addr_t new_area, start, size; + + new_area = xen_find_free_area(boot_params.hdr.ramdisk_size); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n"); + BUG(); + } + + start = boot_params.hdr.ramdisk_image; + size = boot_params.hdr.ramdisk_size; + xen_phys_memcpy(new_area, start, size); + pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", + start, start + size, new_area, new_area + size); + memblock_free(start, size); + boot_params.hdr.ramdisk_image = new_area; + boot_params.ext_ramdisk_image = new_area >> 32; + } + + /* + * Set identity map on non-RAM pages and prepare remapping the + * underlying RAM. + */ + xen_set_identity_and_remap(max_pfn); return "Xen"; } @@ -721,26 +929,30 @@ char * __init xen_memory_setup(void) */ char * __init xen_auto_xlated_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - struct xen_memory_map memmap; int i; int rc; memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc < 0) panic("No memory map (%d)\n", rc); - sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); + xen_e820_map_entries = memmap.nr_entries; + + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), + &xen_e820_map_entries); - for (i = 0; i < memmap.nr_entries; i++) - e820_add_region(map[i].addr, map[i].size, map[i].type); + for (i = 0; i < xen_e820_map_entries; i++) + e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, + xen_e820_map[i].type); - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + /* Remove p2m info, it is not needed. */ + xen_start_info->mfn_list = 0; + xen_start_info->first_p2m_pfn = 0; + xen_start_info->nr_p2m_frames = 0; return "Xen"; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 86484384492e..2a9ff7342791 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -26,6 +26,7 @@ #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> @@ -38,6 +39,7 @@ #include "xen-ops.h" #include "mmu.h" #include "smp.h" +#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_irq_work, cpu).name); per_cpu(xen_irq_work, cpu).name = NULL; } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } }; static int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name; + char *resched_name, *callfunc_name, *debug_name, *pmu_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_irq_work, cpu).irq = rc; per_cpu(xen_irq_work, cpu).name = callfunc_name; + if (is_xen_pmu(cpu)) { + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + return 0; fail: @@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + xen_pmu_init(0); + if (xen_smp_intr_init(0)) BUG(); @@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; + xen_pmu_init(cpu); + rc = xen_smp_intr_init(cpu); if (rc) return rc; @@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu) xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 53b4c0811f4f..feddabdab448 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -11,6 +11,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "pmu.h" static void xen_pv_pre_suspend(void) { @@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - if (xen_pv_domain()) - xen_pv_pre_suspend(); + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + + if (xen_pv_domain()) + xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - if (xen_pv_domain()) - xen_pv_post_suspend(cancelled); - else - xen_hvm_post_suspend(cancelled); + int cpu; + + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 8afdfccf6086..b65f59a358a2 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -104,6 +104,8 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) #else ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) + /* Map the p2m table to a 512GB-aligned user address. */ + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2292721b1d10..1399423f3418 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); +void __init xen_reserve_special_pages(void); +void __init xen_pt_check_e820(void); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); +#ifdef CONFIG_X86_64 +void __init xen_relocate_p2m(void); +#endif +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); +phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5f6b3be0a93c..15083539df15 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -37,6 +37,7 @@ #include <linux/interrupt.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/hdreg.h> #include <linux/cdrom.h> #include <linux/module.h> @@ -147,6 +148,7 @@ struct blkfront_info unsigned int feature_persistent:1; unsigned int max_indirect_segments; int is_ready; + struct blk_mq_tag_set tag_set; }; static unsigned int nr_minors; @@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req, !(info->feature_flush & REQ_FUA))); } -/* - * do_blkif_request - * read a block; request is in a request queue - */ -static void do_blkif_request(struct request_queue *rq) +static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = NULL; - struct request *req; - int queued; - - pr_debug("Entered do_blkif_request\n"); - - queued = 0; + struct blkfront_info *info = qd->rq->rq_disk->private_data; - while ((req = blk_peek_request(rq)) != NULL) { - info = req->rq_disk->private_data; - - if (RING_FULL(&info->ring)) - goto wait; + blk_mq_start_request(qd->rq); + spin_lock_irq(&info->io_lock); + if (RING_FULL(&info->ring)) + goto out_busy; - blk_start_request(req); + if (blkif_request_flush_invalid(qd->rq, info)) + goto out_err; - if (blkif_request_flush_invalid(req, info)) { - __blk_end_request_all(req, -EOPNOTSUPP); - continue; - } + if (blkif_queue_request(qd->rq)) + goto out_busy; - pr_debug("do_blk_req %p: cmd %p, sec %lx, " - "(%u/%u) [%s]\n", - req, req->cmd, (unsigned long)blk_rq_pos(req), - blk_rq_cur_sectors(req), blk_rq_sectors(req), - rq_data_dir(req) ? "write" : "read"); - - if (blkif_queue_request(req)) { - blk_requeue_request(rq, req); -wait: - /* Avoid pointless unplugs. */ - blk_stop_queue(rq); - break; - } + flush_requests(info); + spin_unlock_irq(&info->io_lock); + return BLK_MQ_RQ_QUEUE_OK; - queued++; - } +out_err: + spin_unlock_irq(&info->io_lock); + return BLK_MQ_RQ_QUEUE_ERROR; - if (queued != 0) - flush_requests(info); +out_busy: + spin_unlock_irq(&info->io_lock); + blk_mq_stop_hw_queue(hctx); + return BLK_MQ_RQ_QUEUE_BUSY; } +static struct blk_mq_ops blkfront_mq_ops = { + .queue_rq = blkif_queue_rq, + .map_queue = blk_mq_map_queue, +}; + static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, unsigned int physical_sector_size, unsigned int segments) @@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, struct request_queue *rq; struct blkfront_info *info = gd->private_data; - rq = blk_init_queue(do_blkif_request, &info->io_lock); - if (rq == NULL) + memset(&info->tag_set, 0, sizeof(info->tag_set)); + info->tag_set.ops = &blkfront_mq_ops; + info->tag_set.nr_hw_queues = 1; + info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.numa_node = NUMA_NO_NODE; + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + info->tag_set.cmd_size = 0; + info->tag_set.driver_data = info; + + if (blk_mq_alloc_tag_set(&info->tag_set)) return -1; + rq = blk_mq_init_queue(&info->tag_set); + if (IS_ERR(rq)) { + blk_mq_free_tag_set(&info->tag_set); + return -1; + } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); @@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { unsigned int minor, nr_minors; - unsigned long flags; if (info->rq == NULL) return; - spin_lock_irqsave(&info->io_lock, flags); - /* No more blkif_request(). */ - blk_stop_queue(info->rq); + blk_mq_stop_hw_queues(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - spin_unlock_irqrestore(&info->io_lock, flags); /* Flush gnttab callback work. Must be done with no locks held. */ flush_work(&info->work); @@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) xlbd_release_minors(minor, nr_minors); blk_cleanup_queue(info->rq); + blk_mq_free_tag_set(&info->tag_set); info->rq = NULL; put_disk(info->gd); info->gd = NULL; } +/* Must be called with io_lock holded */ static void kick_pending_request_queues(struct blkfront_info *info) { - if (!RING_FULL(&info->ring)) { - /* Re-enable calldowns. */ - blk_start_queue(info->rq); - /* Kick things off immediately. */ - do_blkif_request(info->rq); - } + if (!RING_FULL(&info->ring)) + blk_mq_start_stopped_hw_queues(info->rq, true); } static void blkif_restart_queue(struct work_struct *work) @@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; /* No more blkif_request(). */ if (info->rq) - blk_stop_queue(info->rq); + blk_mq_stop_hw_queues(info->rq); /* Remove all persistent grants */ if (!list_empty(&info->grants)) { @@ -1146,7 +1142,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) RING_IDX i, rp; unsigned long flags; struct blkfront_info *info = (struct blkfront_info *)dev_id; - int error; spin_lock_irqsave(&info->io_lock, flags); @@ -1187,37 +1182,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) continue; } - error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; + req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; switch (bret->operation) { case BLKIF_OP_DISCARD: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - error = -EOPNOTSUPP; + req->errors = -EOPNOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); } - __blk_end_request_all(req, error); + blk_mq_complete_request(req); break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - error = -EOPNOTSUPP; + req->errors = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && info->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - error = -EOPNOTSUPP; + req->errors = -EOPNOTSUPP; } - if (unlikely(error)) { - if (error == -EOPNOTSUPP) - error = 0; + if (unlikely(req->errors)) { + if (req->errors == -EOPNOTSUPP) + req->errors = 0; info->feature_flush = 0; xlvbd_flush(info); } @@ -1228,7 +1223,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " "request: %x\n", bret->status); - __blk_end_request_all(req, error); + blk_mq_complete_request(req); break; default: BUG(); @@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info) kfree(copy); - /* - * Empty the queue, this is important because we might have - * requests in the queue with more segments than what we - * can handle now. - */ - spin_lock_irq(&info->io_lock); - while ((req = blk_fetch_request(info->rq)) != NULL) { - if (req->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { - list_add(&req->queuelist, &requests); - continue; - } - merge_bio.head = req->bio; - merge_bio.tail = req->biotail; - bio_list_merge(&bio_list, &merge_bio); - req->bio = NULL; - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) - pr_alert("diskcache flush request found!\n"); - __blk_end_request_all(req, 0); - } - spin_unlock_irq(&info->io_lock); - xenbus_switch_state(info->xbdev, XenbusStateConnected); spin_lock_irq(&info->io_lock); @@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info) /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); BUG_ON(req->nr_phys_segments > segs); - blk_requeue_request(info->rq, req); + blk_mq_requeue_request(req); } spin_unlock_irq(&info->io_lock); + blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { /* Traverse the list of pending bios and re-queue them */ diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 7cd226da15fe..73708acce3ca 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -280,4 +280,15 @@ config XEN_ACPI def_bool y depends on X86 && ACPI +config XEN_SYMS + bool "Xen symbols" + depends on X86 && XEN_DOM0 && XENFS + default y if KALLSYMS + help + Exports hypervisor symbols (along with their types and addresses) via + /proc/xen/xensyms file, similar to /proc/kallsyms + +config XEN_HAVE_VPMU + bool + endmenu diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index bf4a23c7c591..1fa633b2d556 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -638,9 +638,9 @@ static int __init balloon_init(void) * regions (see arch/x86/xen/setup.c). */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) - if (xen_extra_mem[i].size) - balloon_add_region(PFN_UP(xen_extra_mem[i].start), - PFN_DOWN(xen_extra_mem[i].size)); + if (xen_extra_mem[i].n_pfns) + balloon_add_region(xen_extra_mem[i].start_pfn, + xen_extra_mem[i].n_pfns); return 0; } diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index ed8bf1067a97..68d129019e8f 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1301,11 +1301,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (!VALID_EVTCHN(evtchn)) return -1; - /* - * Events delivered via platform PCI interrupts are always - * routed to vcpu 0 and hence cannot be rebound. - */ - if (xen_hvm_domain() && !xen_have_vector_callback) + if (!xen_support_evtchn_rebind()) return -1; /* Send future instances of this interrupt to other vcpu. */ diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 96453f8a85c5..b5a7342e0ba5 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -20,6 +20,9 @@ #include <xen/xenbus.h> #include <xen/interface/xen.h> #include <xen/interface/version.h> +#ifdef CONFIG_XEN_HAVE_VPMU +#include <xen/interface/xenpmu.h> +#endif #define HYPERVISOR_ATTR_RO(_name) \ static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) @@ -368,6 +371,126 @@ static void xen_properties_destroy(void) sysfs_remove_group(hypervisor_kobj, &xen_properties_group); } +#ifdef CONFIG_XEN_HAVE_VPMU +struct pmu_mode { + const char *name; + uint32_t mode; +}; + +static struct pmu_mode pmu_modes[] = { + {"off", XENPMU_MODE_OFF}, + {"self", XENPMU_MODE_SELF}, + {"hv", XENPMU_MODE_HV}, + {"all", XENPMU_MODE_ALL} +}; + +static ssize_t pmu_mode_store(struct hyp_sysfs_attr *attr, + const char *buffer, size_t len) +{ + int ret; + struct xen_pmu_params xp; + int i; + + for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) { + if (strncmp(buffer, pmu_modes[i].name, len - 1) == 0) { + xp.val = pmu_modes[i].mode; + break; + } + } + + if (i == ARRAY_SIZE(pmu_modes)) + return -EINVAL; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp); + if (ret) + return ret; + + return len; +} + +static ssize_t pmu_mode_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + struct xen_pmu_params xp; + int i; + uint32_t mode; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_mode_get, &xp); + if (ret) + return ret; + + mode = (uint32_t)xp.val; + for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) { + if (mode == pmu_modes[i].mode) + return sprintf(buffer, "%s\n", pmu_modes[i].name); + } + + return -EINVAL; +} +HYPERVISOR_ATTR_RW(pmu_mode); + +static ssize_t pmu_features_store(struct hyp_sysfs_attr *attr, + const char *buffer, size_t len) +{ + int ret; + uint32_t features; + struct xen_pmu_params xp; + + ret = kstrtou32(buffer, 0, &features); + if (ret) + return ret; + + xp.val = features; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_feature_set, &xp); + if (ret) + return ret; + + return len; +} + +static ssize_t pmu_features_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + struct xen_pmu_params xp; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_feature_get, &xp); + if (ret) + return ret; + + return sprintf(buffer, "0x%x\n", (uint32_t)xp.val); +} +HYPERVISOR_ATTR_RW(pmu_features); + +static struct attribute *xen_pmu_attrs[] = { + &pmu_mode_attr.attr, + &pmu_features_attr.attr, + NULL +}; + +static const struct attribute_group xen_pmu_group = { + .name = "pmu", + .attrs = xen_pmu_attrs, +}; + +static int __init xen_pmu_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_pmu_group); +} + +static void xen_pmu_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_pmu_group); +} +#endif + static int __init hyper_sysfs_init(void) { int ret; @@ -390,7 +513,15 @@ static int __init hyper_sysfs_init(void) ret = xen_properties_init(); if (ret) goto prop_out; - +#ifdef CONFIG_XEN_HAVE_VPMU + if (xen_initial_domain()) { + ret = xen_pmu_init(); + if (ret) { + xen_properties_destroy(); + goto prop_out; + } + } +#endif goto out; prop_out: @@ -407,6 +538,9 @@ out: static void __exit hyper_sysfs_exit(void) { +#ifdef CONFIG_XEN_HAVE_VPMU + xen_pmu_destroy(); +#endif xen_properties_destroy(); xen_compilation_destroy(); xen_sysfs_uuid_destroy(); diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index b019865fcc56..1a83010ddffa 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_XENFS) += xenfs.o xenfs-y = super.o xenfs-$(CONFIG_XEN_DOM0) += xenstored.o +xenfs-$(CONFIG_XEN_SYMS) += xensyms.o diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 06092e0fe8ce..8559a71f36b1 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -57,6 +57,9 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, +#ifdef CONFIG_XEN_SYMS + { "xensyms", &xensyms_ops, S_IRUSR}, +#endif {""}, }; diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 6b80c7779c02..2c5934ea9b1e 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -3,5 +3,6 @@ extern const struct file_operations xsd_kva_file_ops; extern const struct file_operations xsd_port_file_ops; +extern const struct file_operations xensyms_ops; #endif /* _XENFS_XENBUS_H */ diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c new file mode 100644 index 000000000000..f8b12856753f --- /dev/null +++ b/drivers/xen/xenfs/xensyms.c @@ -0,0 +1,152 @@ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/slab.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> +#include <xen/xen-ops.h> +#include "xenfs.h" + + +#define XEN_KSYM_NAME_LEN 127 /* Hypervisor may have different name length */ + +struct xensyms { + struct xen_platform_op op; + char *name; + uint32_t namelen; +}; + +/* Grab next output page from the hypervisor */ +static int xensyms_next_sym(struct xensyms *xs) +{ + int ret; + struct xenpf_symdata *symdata = &xs->op.u.symdata; + uint64_t symnum; + + memset(xs->name, 0, xs->namelen); + symdata->namelen = xs->namelen; + + symnum = symdata->symnum; + + ret = HYPERVISOR_dom0_op(&xs->op); + if (ret < 0) + return ret; + + /* + * If hypervisor's symbol didn't fit into the buffer then allocate + * a larger buffer and try again. + */ + if (unlikely(symdata->namelen > xs->namelen)) { + kfree(xs->name); + + xs->namelen = symdata->namelen; + xs->name = kzalloc(xs->namelen, GFP_KERNEL); + if (!xs->name) + return -ENOMEM; + + set_xen_guest_handle(symdata->name, xs->name); + symdata->symnum--; /* Rewind */ + + ret = HYPERVISOR_dom0_op(&xs->op); + if (ret < 0) + return ret; + } + + if (symdata->symnum == symnum) + /* End of symbols */ + return 1; + + return 0; +} + +static void *xensyms_start(struct seq_file *m, loff_t *pos) +{ + struct xensyms *xs = (struct xensyms *)m->private; + + xs->op.u.symdata.symnum = *pos; + + if (xensyms_next_sym(xs)) + return NULL; + + return m->private; +} + +static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct xensyms *xs = (struct xensyms *)m->private; + + xs->op.u.symdata.symnum = ++(*pos); + + if (xensyms_next_sym(xs)) + return NULL; + + return p; +} + +static int xensyms_show(struct seq_file *m, void *p) +{ + struct xensyms *xs = (struct xensyms *)m->private; + struct xenpf_symdata *symdata = &xs->op.u.symdata; + + seq_printf(m, "%016llx %c %s\n", symdata->address, + symdata->type, xs->name); + + return 0; +} + +static void xensyms_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations xensyms_seq_ops = { + .start = xensyms_start, + .next = xensyms_next, + .show = xensyms_show, + .stop = xensyms_stop, +}; + +static int xensyms_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + struct xensyms *xs; + int ret; + + ret = seq_open_private(file, &xensyms_seq_ops, + sizeof(struct xensyms)); + if (ret) + return ret; + + m = file->private_data; + xs = (struct xensyms *)m->private; + + xs->namelen = XEN_KSYM_NAME_LEN + 1; + xs->name = kzalloc(xs->namelen, GFP_KERNEL); + if (!xs->name) { + seq_release_private(inode, file); + return -ENOMEM; + } + set_xen_guest_handle(xs->op.u.symdata.name, xs->name); + xs->op.cmd = XENPF_get_symbol; + xs->op.u.symdata.namelen = xs->namelen; + + return 0; +} + +static int xensyms_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct xensyms *xs = (struct xensyms *)m->private; + + kfree(xs->name); + return seq_release_private(inode, file); +} + +const struct file_operations xensyms_ops = { + .open = xensyms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = xensyms_release +}; diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index a5de55c04fb2..316bd043319e 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -11,6 +11,8 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr, unsigned long size); extern void *early_memremap(resource_size_t phys_addr, unsigned long size); +extern void *early_memremap_ro(resource_size_t phys_addr, + unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void early_memunmap(void *addr, unsigned long size); diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h index f23174fb9ec4..1cbb8338edf3 100644 --- a/include/asm-generic/fixmap.h +++ b/include/asm-generic/fixmap.h @@ -46,6 +46,9 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) #ifndef FIXMAP_PAGE_NORMAL #define FIXMAP_PAGE_NORMAL PAGE_KERNEL #endif +#if !defined(FIXMAP_PAGE_RO) && defined(PAGE_KERNEL_RO) +#define FIXMAP_PAGE_RO PAGE_KERNEL_RO +#endif #ifndef FIXMAP_PAGE_NOCACHE #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE #endif diff --git a/include/xen/events.h b/include/xen/events.h index 7d95fdf9cf3e..88da2abaf535 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -92,7 +92,6 @@ void xen_hvm_callback_vector(void); #ifdef CONFIG_TRACING #define trace_xen_hvm_callback_vector xen_hvm_callback_vector #endif -extern int xen_have_vector_callback; int xen_set_callback_via(uint64_t via); void xen_evtchn_do_upcall(struct pt_regs *regs); void xen_hvm_evtchn_do_upcall(void); diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index 5cc49ea8d840..8e035871360e 100644 --- a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -474,6 +474,23 @@ struct xenpf_core_parking { }; DEFINE_GUEST_HANDLE_STRUCT(xenpf_core_parking); +#define XENPF_get_symbol 63 +struct xenpf_symdata { + /* IN/OUT variables */ + uint32_t namelen; /* size of 'name' buffer */ + + /* IN/OUT variables */ + uint32_t symnum; /* IN: Symbol to read */ + /* OUT: Next available symbol. If same as IN */ + /* then we reached the end */ + + /* OUT variables */ + GUEST_HANDLE(char) name; + uint64_t address; + char type; +}; +DEFINE_GUEST_HANDLE_STRUCT(xenpf_symdata); + struct xen_platform_op { uint32_t cmd; uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ @@ -495,6 +512,7 @@ struct xen_platform_op { struct xenpf_cpu_hotadd cpu_add; struct xenpf_mem_hotadd mem_add; struct xenpf_core_parking core_parking; + struct xenpf_symdata symdata; uint8_t pad[128]; } u; }; diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index a48378958062..167071c290b3 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -80,6 +80,7 @@ #define __HYPERVISOR_kexec_op 37 #define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ +#define __HYPERVISOR_xenpmu_op 40 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 @@ -112,6 +113,7 @@ #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ #define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ +#define VIRQ_XENPMU 13 /* PMC interrupt */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 @@ -585,26 +587,29 @@ struct shared_info { }; /* - * Start-of-day memory layout for the initial domain (DOM0): + * Start-of-day memory layout + * * 1. The domain is started within contiguous virtual-memory region. * 2. The contiguous region begins and ends on an aligned 4MB boundary. - * 3. The region start corresponds to the load address of the OS image. - * If the load address is not 4MB aligned then the address is rounded down. - * 4. This the order of bootstrap elements in the initial virtual region: + * 3. This the order of bootstrap elements in the initial virtual region: * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] + * (may be omitted) * c. list of allocated page frames [mfn_list, nr_pages] + * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] - * e. bootstrap page tables [pt_base, CR3 (x86)] - * f. bootstrap stack [register ESP (x86)] - * 5. Bootstrap elements are packed together, but each is 4kB-aligned. - * 6. The initial ram disk may be omitted. - * 7. The list of page frames forms a contiguous 'pseudo-physical' memory + * in case of dom0 this page contains the console info, too + * e. unless dom0: xenstore ring page + * f. unless dom0: console ring page + * g. bootstrap page tables [pt_base, CR3 (x86)] + * h. bootstrap stack [register ESP (x86)] + * 4. Bootstrap elements are packed together, but each is 4kB-aligned. + * 5. The list of page frames forms a contiguous 'pseudo-physical' memory * layout for the domain. In particular, the bootstrap virtual-memory * region is a 1:1 mapping to the first section of the pseudo-physical map. - * 8. All bootstrap elements are mapped read-writable for the guest OS. The + * 6. All bootstrap elements are mapped read-writable for the guest OS. The * only exception is the bootstrap page table, which is mapped read-only. - * 9. There is guaranteed to be at least 512kB padding after the final + * 7. There is guaranteed to be at least 512kB padding after the final * bootstrap element. If necessary, the bootstrap virtual region is * extended by an extra 4MB to ensure this. */ @@ -641,10 +646,12 @@ struct start_info { }; /* These flags are passed in the 'flags' field of start_info_t. */ -#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ -#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ -#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ -#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ +#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ +#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_VIRT_P2M_4TOOLS (1<<4) /* Do Xen tools understand a virt. mapped */ + /* P->M making the 3 level tree obsolete? */ #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ /* diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h new file mode 100644 index 000000000000..139efc91bceb --- /dev/null +++ b/include/xen/interface/xenpmu.h @@ -0,0 +1,94 @@ +#ifndef __XEN_PUBLIC_XENPMU_H__ +#define __XEN_PUBLIC_XENPMU_H__ + +#include "xen.h" + +#define XENPMU_VER_MAJ 0 +#define XENPMU_VER_MIN 1 + +/* + * ` enum neg_errnoval + * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args); + * + * @cmd == XENPMU_* (PMU operation) + * @args == struct xenpmu_params + */ +/* ` enum xenpmu_op { */ +#define XENPMU_mode_get 0 /* Also used for getting PMU version */ +#define XENPMU_mode_set 1 +#define XENPMU_feature_get 2 +#define XENPMU_feature_set 3 +#define XENPMU_init 4 +#define XENPMU_finish 5 +#define XENPMU_lvtpc_set 6 +#define XENPMU_flush 7 + +/* ` } */ + +/* Parameters structure for HYPERVISOR_xenpmu_op call */ +struct xen_pmu_params { + /* IN/OUT parameters */ + struct { + uint32_t maj; + uint32_t min; + } version; + uint64_t val; + + /* IN parameters */ + uint32_t vcpu; + uint32_t pad; +}; + +/* PMU modes: + * - XENPMU_MODE_OFF: No PMU virtualization + * - XENPMU_MODE_SELF: Guests can profile themselves + * - XENPMU_MODE_HV: Guests can profile themselves, dom0 profiles + * itself and Xen + * - XENPMU_MODE_ALL: Only dom0 has access to VPMU and it profiles + * everyone: itself, the hypervisor and the guests. + */ +#define XENPMU_MODE_OFF 0 +#define XENPMU_MODE_SELF (1<<0) +#define XENPMU_MODE_HV (1<<1) +#define XENPMU_MODE_ALL (1<<2) + +/* + * PMU features: + * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD) + */ +#define XENPMU_FEATURE_INTEL_BTS 1 + +/* + * Shared PMU data between hypervisor and PV(H) domains. + * + * The hypervisor fills out this structure during PMU interrupt and sends an + * interrupt to appropriate VCPU. + * Architecture-independent fields of xen_pmu_data are WO for the hypervisor + * and RO for the guest but some fields in xen_pmu_arch can be writable + * by both the hypervisor and the guest (see arch-$arch/pmu.h). + */ +struct xen_pmu_data { + /* Interrupted VCPU */ + uint32_t vcpu_id; + + /* + * Physical processor on which the interrupt occurred. On non-privileged + * guests set to vcpu_id; + */ + uint32_t pcpu_id; + + /* + * Domain that was interrupted. On non-privileged guests set to + * DOMID_SELF. + * On privileged guests can be DOMID_SELF, DOMID_XEN, or, when in + * XENPMU_MODE_ALL mode, domain ID of another domain. + */ + domid_t domain_id; + + uint8_t pad[6]; + + /* Architecture-specific information */ + struct xen_pmu_arch pmu; +}; + +#endif /* __XEN_PUBLIC_XENPMU_H__ */ diff --git a/include/xen/page.h b/include/xen/page.h index c5ed20bb3fe9..a5983da2f5cd 100644 --- a/include/xen/page.h +++ b/include/xen/page.h @@ -9,8 +9,8 @@ static inline unsigned long page_to_mfn(struct page *page) } struct xen_memory_region { - phys_addr_t start; - phys_addr_t size; + unsigned long start_pfn; + unsigned long n_pfns; }; #define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */ diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index e10ccd299d66..0cfadafb3fb0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -217,6 +217,13 @@ early_memremap(resource_size_t phys_addr, unsigned long size) return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_NORMAL); } +#ifdef FIXMAP_PAGE_RO +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); +} +#endif #else /* CONFIG_MMU */ void __init __iomem * @@ -231,6 +238,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size) { return (void *)phys_addr; } +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} void __init early_iounmap(void __iomem *addr, unsigned long size) { |