author    Linus Torvalds <torvalds@linux-foundation.org>    2025-12-09 06:10:17 +0900
committer Linus Torvalds <torvalds@linux-foundation.org>    2025-12-09 06:10:17 +0900
commit    feb06d2690bb826fd33798a99ce5cff8d07b38f9 (patch)
tree      9d17c00596e355b195c87c2e4046388b10950bf0
parent    c2f2b01b74be8b40a2173372bcd770723f87e7b2 (diff)
parent    615a6e7d83f958e7ef3bc818e818f7c6433b4c2a (diff)
Merge tag 'hyperv-next-signed-20251207' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
Pull hyperv updates from Wei Liu:

 - Enhancements to Linux as the root partition for Microsoft Hypervisor:

     - Support a new mode called L1VH, which allows Linux to drive the
       hypervisor running the Azure Host directly

     - Support for MSHV crash dump collection

     - Allow Linux's memory management subsystem to better manage guest
       memory regions

     - Fix issues that prevented a clean shutdown of the whole system on
       bare metal and nested configurations

     - ARM64 support for the MSHV driver

     - Various other bug fixes and cleanups

 - Add support for Confidential VMBus for Linux guest on Hyper-V

 - Secure AVIC support for Linux guests on Hyper-V

 - Add the mshv_vtl driver to allow Linux to run as the secure kernel in
   a higher virtual trust level for Hyper-V

* tag 'hyperv-next-signed-20251207' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (58 commits)
  mshv: Cleanly shutdown root partition with MSHV
  mshv: Use reboot notifier to configure sleep state
  mshv: Add definitions for MSHV sleep state configuration
  mshv: Add support for movable memory regions
  mshv: Add refcount and locking to mem regions
  mshv: Fix huge page handling in memory region traversal
  mshv: Move region management to mshv_regions.c
  mshv: Centralize guest memory region destruction
  mshv: Refactor and rename memory region handling functions
  mshv: adjust interrupt control structure for ARM64
  Drivers: hv: use kmalloc_array() instead of kmalloc()
  mshv: Add ioctl for self targeted passthrough hvcalls
  Drivers: hv: Introduce mshv_vtl driver
  Drivers: hv: Export some symbols for mshv_vtl
  static_call: allow using STATIC_CALL_TRAMP_STR() from assembly
  mshv: Extend create partition ioctl to support cpu features
  mshv: Allow mappings that overlap in uaddr
  mshv: Fix create memory region overlap check
  mshv: add WQ_PERCPU to alloc_workqueue users
  Drivers: hv: Use kmalloc_array() instead of kmalloc()
  ...
-rw-r--r--  Documentation/virt/hyperv/coco.rst | 139
-rw-r--r--  MAINTAINERS | 3
-rw-r--r--  arch/x86/hyperv/Makefile | 16
-rw-r--r--  arch/x86/hyperv/hv_apic.c | 8
-rw-r--r--  arch/x86/hyperv/hv_crash.c | 642
-rw-r--r--  arch/x86/hyperv/hv_init.c | 9
-rw-r--r--  arch/x86/hyperv/hv_trampoline.S | 101
-rw-r--r--  arch/x86/hyperv/hv_vtl.c | 30
-rw-r--r--  arch/x86/hyperv/mshv-asm-offsets.c | 37
-rw-r--r--  arch/x86/hyperv/mshv_vtl_asm.S | 116
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 45
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 88
-rw-r--r--  drivers/hv/Kconfig | 29
-rw-r--r--  drivers/hv/Makefile | 9
-rw-r--r--  drivers/hv/channel.c | 75
-rw-r--r--  drivers/hv/channel_mgmt.c | 27
-rw-r--r--  drivers/hv/connection.c | 6
-rw-r--r--  drivers/hv/hv.c | 377
-rw-r--r--  drivers/hv/hv_common.c | 27
-rw-r--r--  drivers/hv/hv_util.c | 2
-rw-r--r--  drivers/hv/hyperv_vmbus.h | 76
-rw-r--r--  drivers/hv/mshv_common.c | 99
-rw-r--r--  drivers/hv/mshv_eventfd.c | 8
-rw-r--r--  drivers/hv/mshv_irq.c | 4
-rw-r--r--  drivers/hv/mshv_regions.c | 555
-rw-r--r--  drivers/hv/mshv_root.h | 57
-rw-r--r--  drivers/hv/mshv_root_hv_call.c | 196
-rw-r--r--  drivers/hv/mshv_root_main.c | 745
-rw-r--r--  drivers/hv/mshv_synic.c | 6
-rw-r--r--  drivers/hv/mshv_vtl.h | 25
-rw-r--r--  drivers/hv/mshv_vtl_main.c | 1392
-rw-r--r--  drivers/hv/ring_buffer.c | 5
-rw-r--r--  drivers/hv/vmbus_drv.c | 188
-rw-r--r--  include/asm-generic/mshyperv.h | 63
-rw-r--r--  include/hyperv/hvgdk_mini.h | 115
-rw-r--r--  include/hyperv/hvhdk.h | 46
-rw-r--r--  include/hyperv/hvhdk_mini.h | 128
-rw-r--r--  include/linux/compiler_types.h | 8
-rw-r--r--  include/linux/hyperv.h | 69
-rw-r--r--  include/linux/static_call_types.h | 4
-rw-r--r--  include/uapi/linux/mshv.h | 116
-rw-r--r--  tools/include/linux/static_call_types.h | 4
42 files changed, 5003 insertions, 692 deletions
diff --git a/Documentation/virt/hyperv/coco.rst b/Documentation/virt/hyperv/coco.rst
index c15d6fe34b4e..3231e51444da 100644
--- a/Documentation/virt/hyperv/coco.rst
+++ b/Documentation/virt/hyperv/coco.rst
@@ -178,7 +178,7 @@ These Hyper-V and VMBus memory pages are marked as decrypted:
* VMBus monitor pages
-* Synthetic interrupt controller (synic) related pages (unless supplied by
+* Synthetic interrupt controller (SynIC) related pages (unless supplied by
the paravisor)
* Per-cpu hypercall input and output pages (unless running with a paravisor)
@@ -232,6 +232,143 @@ with arguments explicitly describing the access. See
_hv_pcifront_read_config() and _hv_pcifront_write_config() and the
"use_calls" flag indicating to use hypercalls.
+Confidential VMBus
+------------------
+Confidential VMBus allows a confidential guest to avoid interacting with the
+untrusted host partition and the untrusted hypervisor. Instead, the guest
+relies on the trusted paravisor to communicate with the devices processing
+sensitive data. The hardware (SNP or TDX) encrypts the guest memory and the
+register state while measuring the paravisor image using the platform security
+processor to ensure trusted and confidential computing.
+
+Confidential VMBus provides a secure communication channel between the guest
+and the paravisor, ensuring that sensitive data is protected from hypervisor-
+level access through memory encryption and register state isolation.
+
+Confidential VMBus is an extension of Confidential Computing (CoCo) VMs
+(a.k.a. "Isolated" VMs in Hyper-V terminology). Without Confidential VMBus,
+guest VMBus device drivers (the "VSC"s in VMBus terminology) communicate
+with VMBus servers (the VSPs) running on the Hyper-V host. The
+communication must be through memory that has been decrypted so the
+host can access it. With Confidential VMBus, one or more of the VSPs reside
+in the trusted paravisor layer in the guest VM. Since the paravisor layer also
+operates in encrypted memory, the memory used for communication with
+such VSPs does not need to be decrypted and thereby exposed to the
+Hyper-V host. The paravisor is responsible for communicating securely
+with the Hyper-V host as necessary.
+
+The data is transferred directly between the VM and a vPCI device (a.k.a.
+a PCI pass-thru device, see :doc:`vpci`) that is directly assigned to VTL2
+and that supports encrypted memory. In such a case, neither the host partition
+nor the hypervisor has any access to the data. The guest needs to establish
+a VMBus connection only with the paravisor for the channels that process
+sensitive data, and the paravisor abstracts away the details of communicating
+with the specific devices, providing the guest with the well-established
+VSP (Virtualization Service Provider) interface that has had support in the Hyper-V
+drivers for a decade.
+
+If the device does not support encrypted memory, the paravisor
+provides bounce-buffering, and although the data is not encrypted, the backing
+pages aren't mapped into the host partition through SLAT. While not impossible,
+it becomes much more difficult for the host partition to exfiltrate the data
+than it would be with a conventional VMBus connection where the host partition
+has direct access to the memory used for communication.
+
+Here is the data flow for a conventional VMBus connection (`C` stands for the
+client or VSC, `S` for the server or VSP; the `DEVICE` is a physical device,
+possibly with multiple virtual functions)::
+
+ +---- GUEST ----+ +----- DEVICE ----+ +----- HOST -----+
+ | | | | | |
+ | | | | | |
+ | | | ========== |
+ | | | | | |
+ | | | | | |
+ | | | | | |
+ +----- C -------+ +-----------------+ +------- S ------+
+ || ||
+ || ||
+ +------||------------------ VMBus --------------------------||------+
+ | Interrupts, MMIO |
+ +-------------------------------------------------------------------+
+
+and the Confidential VMBus connection::
+
+ +---- GUEST --------------- VTL0 ------+ +-- DEVICE --+
+ | | | |
+ | +- PARAVISOR --------- VTL2 -----+ | | |
+ | | +-- VMBus Relay ------+ ====+================ |
+ | | | Interrupts, MMIO | | | | |
+ | | +-------- S ----------+ | | +------------+
+ | | || | |
+ | +---------+ || | |
+ | | Linux | || OpenHCL | |
+ | | kernel | || | |
+ | +---- C --+-----||---------------+ |
+ | || || |
+ +-------++------- C -------------------+ +------------+
+ || | HOST |
+ || +---- S -----+
+ +-------||----------------- VMBus ---------------------------||-----+
+ | Interrupts, MMIO |
+ +-------------------------------------------------------------------+
+
+An implementation of the VMBus relay that offers the Confidential VMBus
+channels is available in the OpenVMM project as a part of the OpenHCL
+paravisor. Please refer to
+
+ * https://openvmm.dev/, and
+ * https://github.com/microsoft/openvmm
+
+for more information about the OpenHCL paravisor.
+
+A guest that is running with a paravisor must determine at runtime if
+Confidential VMBus is supported by the current paravisor. The x86_64-specific
+approach relies on the CPUID Virtualization Stack leaf; the ARM64 implementation
+is expected to support the Confidential VMBus unconditionally when running
+ARM CCA guests.
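+
+For illustration, a minimal sketch of what the x86_64 probe amounts to,
+modelled on the ms_hyperv_init_platform() change elsewhere in this series
+(the VMBus driver consumes the cached ms_hyperv.confidential_vmbus_available
+flag rather than re-reading CPUID)::
+
+  static bool hv_confidential_vmbus_available(void)
+  {
+          u32 eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+
+          /* Reported by the virtualization stack/paravisor, not the hypervisor */
+          return !!(eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE);
+  }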
+
+Confidential VMBus is a characteristic of the VMBus connection as a whole,
+and of each VMBus channel that is created. When a Confidential VMBus
+connection is established, the paravisor provides the guest the message-passing
+path that is used for VMBus device creation and deletion, and it provides a
+per-CPU synthetic interrupt controller (SynIC) just like the SynIC that is
+offered by the Hyper-V host. Each VMBus device that is offered to the guest
+indicates the degree to which it participates in Confidential VMBus. The offer
+indicates if the device uses encrypted ring buffers, and if the device uses
+encrypted memory for DMA that is done outside the ring buffer. These settings
+may be different for different devices using the same Confidential VMBus
+connection.
+
+Although these settings are separate, in practice a channel either encrypts
+only the ring buffer, or both the ring buffer and external data. If a channel
+is offered by the paravisor over Confidential VMBus, the ring buffer can always
+be encrypted since it's strictly for communication between the VTL2 paravisor
+and the VTL0 guest. However, other memory regions are often used for e.g. DMA,
+so they need to be accessible by the underlying hardware, and must be
+unencrypted (unless the device supports encrypted memory). Currently, there are
+not any VSPs in OpenHCL that support encrypted external memory, but future
+versions are expected to enable this capability.
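+
+For illustration, this is roughly how those per-channel flags are consumed
+when a GPADL is set up, following the __vmbus_establish_gpadl() change in
+this series (field names as in the driver; a sketch, not the full logic)::
+
+  /* Does this GPADL have to be shared (decrypted) with the host? */
+  static bool gpadl_needs_decryption(const struct vmbus_channel *channel,
+                                     enum hv_gpadl_type type)
+  {
+          /* Confidential ring buffers stay encrypted: paravisor <-> guest only */
+          if (channel->co_ring_buffer && type == HV_GPADL_RING)
+                  return false;
+
+          /* External/DMA buffers stay encrypted only if the device supports it */
+          if (channel->co_external_memory && type == HV_GPADL_BUFFER)
+                  return false;
+
+          return true;
+  }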
+
+Because some devices on a Confidential VMBus may require decrypted ring buffers
+and DMA transfers, the guest must interact with two SynICs -- the one provided
+by the paravisor and the one provided by the Hyper-V host when Confidential
+VMBus is not offered. Interrupts are always signaled by the paravisor SynIC,
+but the guest must check for messages and for channel interrupts on both SynICs.
+
+With Confidential VMBus, regular SynIC access by the guest is intercepted by
+the paravisor (this includes various MSRs such as the SIMP and SIEFP, as well
+as hypercalls like HvPostMessage and HvSignalEvent). If the guest actually
+wants to communicate with the hypervisor, it has to use special mechanisms
+(the GHCB page on SNP, or tdcall on TDX). Messages can be of either kind: with
+Confidential VMBus, messages use the paravisor SynIC, and if the guest chooses
+to communicate directly with the hypervisor, they use the hypervisor SynIC.
+For interrupt signaling, some channels may be running on the host
+(non-confidential, using the VMBus relay) and use the hypervisor SynIC, and
+some on the paravisor and use its SynIC. The RelIDs are coordinated by the
+OpenHCL VMBus server and are guaranteed to be unique regardless of whether
+the channel originated on the host or the paravisor.
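+
+The MSR side of this interception shows up in the hv_set_non_nested_msr()
+change elsewhere in this series; condensed to a sketch here (the proxy-bit
+bookkeeping for SINT registers is omitted)::
+
+  if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+          /* GHCB page on SNP / tdcall on TDX: this write reaches the hypervisor */
+          hv_ivm_msr_write(reg, value);
+          /* A plain MSR write is intercepted and handled by the paravisor */
+          if (hv_is_sint_msr(reg))
+                  native_wrmsrq(reg, value);
+  } else {
+          native_wrmsrq(reg, value);
+  }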
+
load_unaligned_zeropad()
------------------------
When transitioning memory between encrypted and decrypted, the caller of
diff --git a/MAINTAINERS b/MAINTAINERS
index d701a4d5b00e..9f886de7e4ab 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11705,6 +11705,7 @@ M: "K. Y. Srinivasan" <kys@microsoft.com>
M: Haiyang Zhang <haiyangz@microsoft.com>
M: Wei Liu <wei.liu@kernel.org>
M: Dexuan Cui <decui@microsoft.com>
+M: Long Li <longli@microsoft.com>
L: linux-hyperv@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git
@@ -11722,6 +11723,7 @@ F: arch/x86/kernel/cpu/mshyperv.c
F: drivers/clocksource/hyperv_timer.c
F: drivers/hid/hid-hyperv.c
F: drivers/hv/
+F: drivers/infiniband/hw/mana/
F: drivers/input/serio/hyperv-keyboard.c
F: drivers/iommu/hyperv-iommu.c
F: drivers/net/ethernet/microsoft/
@@ -11740,6 +11742,7 @@ F: include/hyperv/hvhdk_mini.h
F: include/linux/hyperv.h
F: include/net/mana
F: include/uapi/linux/hyperv.h
+F: include/uapi/rdma/mana-abi.h
F: net/vmw_vsock/hyperv_transport.c
F: tools/hv/
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index d55f494f471d..56292102af62 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,8 +1,22 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
obj-$(CONFIG_X86_64) += hv_apic.o
-obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o
+obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o mshv_vtl_asm.o
+
+$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h
+
+$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE
+ $(call filechk,offsets,__MSHV_ASM_OFFSETS_H__)
ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
+
+ ifdef CONFIG_MSHV_ROOT
+ CFLAGS_REMOVE_hv_trampoline.o += -pg
+ CFLAGS_hv_trampoline.o += -fno-stack-protector
+ obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o
+ endif
endif
+
+targets += mshv-asm-offsets.s
+clean-files += mshv-asm-offsets.h
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index bfde0a3498b9..a8de503def37 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -53,6 +53,11 @@ static void hv_apic_icr_write(u32 low, u32 id)
wrmsrq(HV_X64_MSR_ICR, reg_val);
}
+void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+ apic_update_vector(cpu, vector, set);
+}
+
static u32 hv_apic_read(u32 reg)
{
u32 reg_val, hi;
@@ -293,6 +298,9 @@ static void hv_send_ipi_self(int vector)
void __init hv_apic_init(void)
{
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return;
+
if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
pr_info("Hyper-V: Using IPI hypercalls\n");
/*
diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
new file mode 100644
index 000000000000..c0e22921ace1
--- /dev/null
+++ b/arch/x86/hyperv/hv_crash.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * X86 specific Hyper-V root partition kdump/crash support module
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ * This module implements hypervisor RAM collection into vmcore for both
+ * cases of the hypervisor crash and Linux root crash. Hyper-V implements
+ * a disable hypercall with a 32bit protected mode ABI callback. This
+ * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM
+ * is already mapped in Linux, it is automatically collected into Linux vmcore,
+ * and can be examined by the crash command (raw RAM dump) or windbg.
+ *
+ * At a high level:
+ *
+ * Hypervisor Crash:
+ *    Upon crash, the hypervisor goes into an emergency minimal dispatch loop,
+ *    a restrictive mode with very limited hypercall and MSR support. NMIs are
+ *    then injected into the root vcpus. Linux checks a shared page in its NMI
+ *    handler to determine whether the hypervisor has crashed. This shared
+ *    page is set up in hv_root_crash_init during boot.
+ *
+ * Linux Crash:
+ * In case of Linux crash, the callback hv_crash_stop_other_cpus will send
+ * NMIs to all cpus, then proceed to the crash_nmi_callback where it waits
+ * for all cpus to be in NMI.
+ *
+ * NMI Handler (upon quorum):
+ * Eventually, in both cases, all cpus will end up in the NMI handler.
+ *    Hyper-V requires that disabling the hypervisor be done from the BSP. So
+ *    the BSP NMI handler saves the current context, does some fixups and makes
+ *    the hypercall to disable the hypervisor, i.e., devirtualize. The hypervisor
+ *    at that point will suspend all vcpus (except the BSP), unlock all its
+ * RAM, and return to Linux at the 32bit mode entry RIP.
+ *
+ * Linux 32bit entry trampoline will then restore long mode and call C
+ * function here to restore context and continue execution to crash kexec.
+ */
+
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/panic.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/mshyperv.h>
+#include <asm/nmi.h>
+#include <asm/idtentry.h>
+#include <asm/reboot.h>
+#include <asm/intel_pt.h>
+
+bool hv_crash_enabled;
+EXPORT_SYMBOL_GPL(hv_crash_enabled);
+
+struct hv_crash_ctxt {
+ ulong rsp;
+ ulong cr0;
+ ulong cr2;
+ ulong cr4;
+ ulong cr8;
+
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 es;
+ u16 fs;
+ u16 gs;
+
+ u16 gdt_fill;
+ struct desc_ptr gdtr;
+ char idt_fill[6];
+ struct desc_ptr idtr;
+
+ u64 gsbase;
+ u64 efer;
+ u64 pat;
+};
+static struct hv_crash_ctxt hv_crash_ctxt;
+
+/* Shared hypervisor page that contains crash dump area we peek into.
+ * NB: windbg looks for "hv_cda" symbol so don't change it.
+ */
+static struct hv_crashdump_area *hv_cda;
+
+static u32 trampoline_pa, devirt_arg;
+static atomic_t crash_cpus_wait;
+static void *hv_crash_ptpgs[4];
+static bool hv_has_crashed, lx_has_crashed;
+
+static void __noreturn hv_panic_timeout_reboot(void)
+{
+ #define PANIC_TIMER_STEP 100
+
+ if (panic_timeout > 0) {
+ int i;
+
+ for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP)
+ mdelay(PANIC_TIMER_STEP);
+ }
+
+ if (panic_timeout)
+ native_wrmsrq(HV_X64_MSR_RESET, 1); /* get hyp to reboot */
+
+ for (;;)
+ cpu_relax();
+}
+
+/* This cannot be inlined as it needs stack */
+static noinline __noclone void hv_crash_restore_tss(void)
+{
+ load_TR_desc();
+}
+
+/* This cannot be inlined as it needs stack */
+static noinline void hv_crash_clear_kernpt(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ /* Clear entry so it's not confusing to someone looking at the core */
+ pgd = pgd_offset_k(trampoline_pa);
+ p4d = p4d_offset(pgd, trampoline_pa);
+ native_p4d_clear(p4d);
+}
+
+/*
+ * This is the C entry point from the asm glue code after the disable hypercall.
+ * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
+ * page tables with our below 4G page identity mapped, but using a temporary
+ * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not
+ * available. We restore kernel GDT, and rest of the context, and continue
+ * to kexec.
+ */
+static asmlinkage void __noreturn hv_crash_c_entry(void)
+{
+ struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+ /* first thing, restore kernel gdt */
+ native_load_gdt(&ctxt->gdtr);
+
+ asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
+ asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
+
+ asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
+ asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
+ asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
+ asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
+
+ native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
+ asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
+
+ asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
+ asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
+	asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr2));
+
+ native_load_idt(&ctxt->idtr);
+ native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
+ native_wrmsrq(MSR_EFER, ctxt->efer);
+
+ /* restore the original kernel CS now via far return */
+ asm volatile("movzwq %0, %%rax\n\t"
+ "pushq %%rax\n\t"
+ "pushq $1f\n\t"
+ "lretq\n\t"
+ "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
+
+ /* We are in asmlinkage without stack frame, hence make C function
+ * calls which will buy stack frames.
+ */
+ hv_crash_restore_tss();
+ hv_crash_clear_kernpt();
+
+ /* we are now fully in devirtualized normal kernel mode */
+ __crash_kexec(NULL);
+
+ hv_panic_timeout_reboot();
+}
+/* Tell gcc we are using lretq long jump in the above function intentionally */
+STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
+
+static void hv_mark_tss_not_busy(void)
+{
+ struct desc_struct *desc = get_current_gdt_rw();
+ tss_desc tss;
+
+ memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
+ tss.type = 0x9; /* available 64-bit TSS. 0xB is busy TSS */
+ write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
+}
+
+/* Save essential context */
+static void hv_hvcrash_ctxt_save(void)
+{
+ struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+ asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp));
+
+ ctxt->cr0 = native_read_cr0();
+ ctxt->cr4 = native_read_cr4();
+
+ asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2));
+ asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8));
+
+ asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs));
+ asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss));
+ asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds));
+ asm volatile("movl %%es, %%eax" : "=a"(ctxt->es));
+ asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs));
+ asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs));
+
+ native_store_gdt(&ctxt->gdtr);
+ store_idt(&ctxt->idtr);
+
+ ctxt->gsbase = __rdmsr(MSR_GS_BASE);
+ ctxt->efer = __rdmsr(MSR_EFER);
+ ctxt->pat = __rdmsr(MSR_IA32_CR_PAT);
+}
+
+/* Add trampoline page to the kernel pagetable for transition to kernel PT */
+static void hv_crash_fixup_kernpt(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ pgd = pgd_offset_k(trampoline_pa);
+ p4d = p4d_offset(pgd, trampoline_pa);
+
+ /* trampoline_pa is below 4G, so no pre-existing entry to clobber */
+ p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]);
+ p4d->p4d = p4d->p4d & ~(_PAGE_NX); /* enable execute */
+}
+
+/*
+ * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce
+ * and suspend all guest VPs.
+ */
+static void hv_notify_prepare_hyp(void)
+{
+ u64 status;
+ struct hv_input_notify_partition_event *input;
+ struct hv_partition_event_root_crashdump_input *cda;
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ cda = &input->input.crashdump_input;
+ memset(input, 0, sizeof(*input));
+ input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP;
+
+ cda->crashdump_action = HV_CRASHDUMP_ENTRY;
+ status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
+ if (!hv_result_success(status))
+ return;
+
+ cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS;
+ hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
+}
+
+/*
+ * Common function for all cpus before devirtualization.
+ *
+ * Hypervisor crash: all cpus get here in NMI context.
+ * Linux crash: the panicking cpu gets here at base level, all others in NMI
+ *              context. Note, the panicking cpu may not be the BSP.
+ *
+ * The function is not inlined so it will show on the stack. It is named so
+ * because the crash cmd looks for certain well known function names on the
+ * stack before looking into the cpu saved note in the elf section, and
+ * that work is currently incomplete.
+ *
+ * Notes:
+ * Hypervisor crash:
+ * - the hypervisor is in a very restrictive mode at this point and any
+ * vmexit it cannot handle would result in reboot. So, no mumbo jumbo,
+ * just get to kexec as quickly as possible.
+ *
+ * Devirtualization is supported from the BSP only at present.
+ */
+static noinline __noclone void crash_nmi_callback(struct pt_regs *regs)
+{
+ struct hv_input_disable_hyp_ex *input;
+ u64 status;
+ int msecs = 1000, ccpu = smp_processor_id();
+
+ if (ccpu == 0) {
+ /* crash_save_cpu() will be done in the kexec path */
+ cpu_emergency_stop_pt(); /* disable performance trace */
+ atomic_inc(&crash_cpus_wait);
+ } else {
+ crash_save_cpu(regs, ccpu);
+ cpu_emergency_stop_pt(); /* disable performance trace */
+ atomic_inc(&crash_cpus_wait);
+ for (;;)
+ cpu_relax();
+ }
+
+ while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--)
+ mdelay(1);
+
+ stop_nmi();
+ if (!hv_has_crashed)
+ hv_notify_prepare_hyp();
+
+ if (crashing_cpu == -1)
+ crashing_cpu = ccpu; /* crash cmd uses this */
+
+ hv_hvcrash_ctxt_save();
+ hv_mark_tss_not_busy();
+ hv_crash_fixup_kernpt();
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->rip = trampoline_pa;
+ input->arg = devirt_arg;
+
+ status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);
+
+ hv_panic_timeout_reboot();
+}
+
+
+static DEFINE_SPINLOCK(hv_crash_reboot_lk);
+
+/*
+ * Generic NMI callback handler: could be called without any crash also.
+ * hv crash: hypervisor injects NMIs into all cpus
+ * lx crash: panicking cpu sends NMI to all but self via crash_stop_other_cpus
+ */
+static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs)
+{
+ if (!hv_has_crashed && hv_cda && hv_cda->cda_valid)
+ hv_has_crashed = true;
+
+ if (!hv_has_crashed && !lx_has_crashed)
+ return NMI_DONE; /* ignore the NMI */
+
+ if (hv_has_crashed && !kexec_crash_loaded()) {
+ if (spin_trylock(&hv_crash_reboot_lk))
+ hv_panic_timeout_reboot();
+ else
+ for (;;)
+ cpu_relax();
+ }
+
+ crash_nmi_callback(regs);
+
+ return NMI_DONE;
+}
+
+/*
+ * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus
+ *
+ * On normal Linux panic, this is called twice: first from panic and then again
+ * from native_machine_crash_shutdown.
+ *
+ * In case of hyperv, 3 ways to get here:
+ * 1. hv crash (only BSP will get here):
+ * BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry
+ * -> __crash_kexec -> native_machine_crash_shutdown
+ * -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus
+ * Linux panic:
+ * 2. panic cpu x: panic() -> crash_smp_send_stop
+ * -> smp_ops.crash_stop_other_cpus
+ * 3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop
+ *
+ * NB: noclone and non standard stack because of call to crash_setup_regs().
+ */
+static void __noclone hv_crash_stop_other_cpus(void)
+{
+ static bool crash_stop_done;
+ struct pt_regs lregs;
+ int ccpu = smp_processor_id();
+
+ if (hv_has_crashed)
+ return; /* all cpus already in NMI handler path */
+
+ if (!kexec_crash_loaded()) {
+ hv_notify_prepare_hyp();
+ hv_panic_timeout_reboot(); /* no return */
+ }
+
+ /* If the hv crashes also, we could come here again before cpus_stopped
+ * is set in crash_smp_send_stop(). So use our own check.
+ */
+ if (crash_stop_done)
+ return;
+ crash_stop_done = true;
+
+ /* Linux has crashed: hv is healthy, we can IPI safely */
+ lx_has_crashed = true;
+ wmb(); /* NMI handlers look at lx_has_crashed */
+
+ apic->send_IPI_allbutself(NMI_VECTOR);
+
+ if (crashing_cpu == -1)
+ crashing_cpu = ccpu; /* crash cmd uses this */
+
+ /* crash_setup_regs() happens in kexec also, but for the kexec cpu which
+ * is the BSP. We could be here on non-BSP cpu, collect regs if so.
+ */
+ if (ccpu)
+ crash_setup_regs(&lregs, NULL);
+
+ crash_nmi_callback(&lregs);
+}
+STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus);
+
+/* This GDT is accessed in IA32-e compat mode which uses 32bits addresses */
+struct hv_gdtreg_32 {
+ u16 fill;
+ u16 limit;
+ u32 address;
+} __packed;
+
+/* We need a CS with L bit to goto IA32-e long mode from 32bit compat mode */
+struct hv_crash_tramp_gdt {
+ u64 null; /* index 0, selector 0, null selector */
+ u64 cs64; /* index 1, selector 8, cs64 selector */
+} __packed;
+
+/* No stack, so jump via far ptr in memory to load the 64bit CS */
+struct hv_cs_jmptgt {
+ u32 address;
+ u16 csval;
+ u16 fill;
+} __packed;
+
+/* Linux use only, hypervisor doesn't look at this struct */
+struct hv_crash_tramp_data {
+ u64 tramp32_cr3;
+ u64 kernel_cr3;
+ struct hv_gdtreg_32 gdtr32;
+ struct hv_crash_tramp_gdt tramp_gdt;
+ struct hv_cs_jmptgt cs_jmptgt;
+ u64 c_entry_addr;
+} __packed;
+
+/*
+ * Setup a temporary gdt to allow the asm code to switch to the long mode.
+ * Since the asm code is relocated/copied to a below 4G page, it cannot use rip
+ * relative addressing, hence we must use trampoline_pa here. Also, save other
+ * info like jmp and C entry targets for same reasons.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+static int hv_crash_setup_trampdata(u64 trampoline_va)
+{
+ int size, offs;
+ void *dest;
+ struct hv_crash_tramp_data *tramp;
+
+ /* These must match exactly the ones in the corresponding asm file */
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data,
+ cs_jmptgt.address) != 40);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48);
+
+ /* hv_crash_asm_end is beyond last byte by 1 */
+ size = &hv_crash_asm_end - &hv_crash_asm32;
+ if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) {
+ pr_err("%s: trampoline page overflow\n", __func__);
+ return -1;
+ }
+
+ dest = (void *)trampoline_va;
+ memcpy(dest, &hv_crash_asm32, size);
+
+ dest += size;
+ dest = (void *)round_up((ulong)dest, 16);
+ tramp = (struct hv_crash_tramp_data *)dest;
+
+ /* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by
+ * non-PCID-aware users". Build cr3 with pcid 0
+ */
+ tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]);
+
+ /* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */
+ tramp->kernel_cr3 = __sme_pa(init_mm.pgd);
+
+ tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt);
+ tramp->gdtr32.address = trampoline_pa +
+ (ulong)&tramp->tramp_gdt - trampoline_va;
+
+ /* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */
+ tramp->tramp_gdt.cs64 = 0x00af9a000000ffff;
+
+ tramp->cs_jmptgt.csval = 0x8;
+ offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32;
+ tramp->cs_jmptgt.address = trampoline_pa + offs;
+
+ tramp->c_entry_addr = (u64)&hv_crash_c_entry;
+
+ devirt_arg = trampoline_pa + (ulong)dest - trampoline_va;
+
+ return 0;
+}
+
+/*
+ * Build 32bit trampoline page table for transition from protected mode
+ * non-paging to long-mode paging. This transition needs pagetables below 4G.
+ */
+static void hv_crash_build_tramp_pt(void)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ u64 pa, addr = trampoline_pa;
+
+ p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d);
+ pa = virt_to_phys(hv_crash_ptpgs[1]);
+ set_p4d(p4d, __p4d(_PAGE_TABLE | pa));
+ p4d->p4d &= ~(_PAGE_NX); /* enable execute */
+
+ pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud);
+ pa = virt_to_phys(hv_crash_ptpgs[2]);
+ set_pud(pud, __pud(_PAGE_TABLE | pa));
+
+ pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd);
+ pa = virt_to_phys(hv_crash_ptpgs[3]);
+ set_pmd(pmd, __pmd(_PAGE_TABLE | pa));
+
+ pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte);
+ set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+/*
+ * Setup trampoline for devirtualization:
+ * - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to
+ * in protected mode.
+ * - 4 pages for a temporary page table that asm code uses to turn paging on
+ * - a temporary gdt to use in the compat mode.
+ *
+ * Returns: 0 on success
+ */
+static int hv_crash_trampoline_setup(void)
+{
+ int i, rc, order;
+ struct page *page;
+ u64 trampoline_va;
+ gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO;
+
+ /* page for 32bit trampoline assembly code + hv_crash_tramp_data */
+ page = alloc_page(flags32);
+ if (page == NULL) {
+ pr_err("%s: failed to alloc asm stub page\n", __func__);
+ return -1;
+ }
+
+ trampoline_va = (u64)page_to_virt(page);
+ trampoline_pa = (u32)page_to_phys(page);
+
+ order = 2; /* alloc 2^2 pages */
+ page = alloc_pages(flags32, order);
+ if (page == NULL) {
+ pr_err("%s: failed to alloc pt pages\n", __func__);
+ free_page(trampoline_va);
+ return -1;
+ }
+
+ for (i = 0; i < 4; i++, page++)
+ hv_crash_ptpgs[i] = page_to_virt(page);
+
+ hv_crash_build_tramp_pt();
+
+ rc = hv_crash_setup_trampdata(trampoline_va);
+ if (rc)
+ goto errout;
+
+ return 0;
+
+errout:
+ free_page(trampoline_va);
+ free_pages((ulong)hv_crash_ptpgs[0], order);
+
+ return rc;
+}
+
+/* Setup for kdump kexec to collect hypervisor RAM when running as root */
+void hv_root_crash_init(void)
+{
+ int rc;
+ struct hv_input_get_system_property *input;
+ struct hv_output_get_system_property *output;
+ unsigned long flags;
+ u64 status;
+ union hv_pfn_range cda_info;
+
+ if (pgtable_l5_enabled()) {
+ pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n");
+ return;
+ }
+
+ rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST,
+ "hv_crash_nmi");
+ if (rc) {
+ pr_err("Hyper-V: failed to register crash nmi handler\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA;
+
+ status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
+ cda_info.as_uint64 = output->hv_cda_info.as_uint64;
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status)) {
+ pr_err("Hyper-V: %s: property:%d %s\n", __func__,
+ input->property_id, hv_result_to_string(status));
+ goto err_out;
+ }
+
+ if (cda_info.base_pfn == 0) {
+ pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n");
+ goto err_out;
+ }
+
+ hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT);
+
+ rc = hv_crash_trampoline_setup();
+ if (rc)
+ goto err_out;
+
+ smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus;
+
+ crash_kexec_post_notifiers = true;
+ hv_crash_enabled = true;
+ pr_info("Hyper-V: both linux and hypervisor kdump support enabled\n");
+
+ return;
+
+err_out:
+ unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi");
+ pr_err("Hyper-V: only linux root kdump support enabled\n");
+}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 085ef4f2e73a..14de43f4bc6c 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -170,6 +170,10 @@ static int hv_cpu_init(unsigned int cpu)
wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
+ /* Allow Hyper-V stimer vector to be injected from Hypervisor. */
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, true);
+
return hyperv_init_ghcb();
}
@@ -277,6 +281,9 @@ static int hv_cpu_die(unsigned int cpu)
*ghcb_va = NULL;
}
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, false);
+
hv_common_cpu_die(cpu);
if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
@@ -551,6 +558,8 @@ void __init hyperv_init(void)
memunmap(src);
hv_remap_tsc_clocksource();
+ hv_root_crash_init();
+ hv_sleep_notifiers_register();
} else {
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
diff --git a/arch/x86/hyperv/hv_trampoline.S b/arch/x86/hyperv/hv_trampoline.S
new file mode 100644
index 000000000000..25f02ff12286
--- /dev/null
+++ b/arch/x86/hyperv/hv_trampoline.S
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * X86 specific Hyper-V kdump/crash related code.
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * void noreturn hv_crash_asm32(arg1)
+ * arg1 == edi == 32bit PA of struct hv_crash_tramp_data
+ *
+ * The hypervisor jumps here upon devirtualization in protected mode. This
+ * code gets copied to a page in the low 4G, i.e., 32bit space, so it can run
+ * in protected mode. Hence we cannot use any compile/link time offsets or
+ * addresses. It restores long mode via temporary gdt and page tables and
+ * eventually jumps to kernel code entry at HV_CRASHDATA_OFFS_C_entry.
+ *
+ * PreCondition (ie, Hypervisor call back ABI):
+ * o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled
+ * o CR4 is set to 0x0
+ * o IA32_EFER is set to 0x901 (SCE and NXE are set)
+ * o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX.
+ * o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF
+ * o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF
+ * o LDTR is initialized as invalid (limit of 0)
+ * o MSR PAT is power on default.
+ * o Other state/registers are cleared. All TLBs flushed.
+ */
+
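+/*
+ * Byte offsets into struct hv_crash_tramp_data (see hv_crash.c); the
+ * BUILD_BUG_ON() checks in hv_crash_setup_trampdata() keep them in sync.
+ */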
+#define HV_CRASHDATA_OFFS_TRAMPCR3 0x0 /* 0 */
+#define HV_CRASHDATA_OFFS_KERNCR3 0x8 /* 8 */
+#define HV_CRASHDATA_OFFS_GDTRLIMIT 0x12 /* 18 */
+#define HV_CRASHDATA_OFFS_CS_JMPTGT 0x28 /* 40 */
+#define HV_CRASHDATA_OFFS_C_entry 0x30 /* 48 */
+
+ .text
+ .code32
+
+SYM_CODE_START(hv_crash_asm32)
+ UNWIND_HINT_UNDEFINED
+ ENDBR
+ movl $X86_CR4_PAE, %ecx
+ movl %ecx, %cr4
+
+ movl %edi, %ebx
+ add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx
+ movl %cs:(%ebx), %eax
+ movl %eax, %cr3
+
+ /* Setup EFER for long mode now */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Turn paging on using the temp 32bit trampoline page table */
+ movl %cr0, %eax
+ orl $(X86_CR0_PG), %eax
+ movl %eax, %cr0
+
+ /* since kernel cr3 could be above 4G, we need to be in the long mode
+ * before we can load 64bits of the kernel cr3. We use a temp gdt for
+ * that with CS.L=1 and CS.D=0 */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax
+ lgdtl %cs:(%eax)
+
+ /* not done yet, restore CS now to switch to CS.L=1 */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax
+ ljmp %cs:*(%eax)
+SYM_CODE_END(hv_crash_asm32)
+
+ /* we now run in full 64bit IA32-e long mode, CS.L=1 and CS.D=0 */
+ .code64
+ .balign 8
+SYM_CODE_START(hv_crash_asm64)
+ UNWIND_HINT_UNDEFINED
+ ENDBR
+ /* restore kernel page tables so we can jump to kernel code */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_KERNCR3, %eax
+ movq %cs:(%eax), %rbx
+ movq %rbx, %cr3
+
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_C_entry, %eax
+ movq %cs:(%eax), %rbx
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rbx
+
+ int $3
+
+SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL)
+SYM_CODE_END(hv_crash_asm64)
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 042e8712d8de..c0edaed0efb3 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -9,12 +9,17 @@
#include <asm/apic.h>
#include <asm/boot.h>
#include <asm/desc.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/types.h>
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/msr.h>
#include <asm/realmode.h>
#include <asm/reboot.h>
+#include <asm/smap.h>
+#include <linux/export.h>
#include <../kernel/smpboot.h>
+#include "../../kernel/fpu/legacy.h"
extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;
@@ -249,3 +254,28 @@ int __init hv_vtl_early_init(void)
return 0;
}
+
+DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void));
+
+void mshv_vtl_return_call_init(u64 vtl_return_offset)
+{
+ static_call_update(__mshv_vtl_return_hypercall,
+ (void *)((u8 *)hv_hypercall_pg + vtl_return_offset));
+}
+EXPORT_SYMBOL(mshv_vtl_return_call_init);
+
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+{
+ struct hv_vp_assist_page *hvp;
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+ hvp->vtl_ret_x64rax = vtl0->rax;
+ hvp->vtl_ret_x64rcx = vtl0->rcx;
+
+ kernel_fpu_begin_mask(0);
+ fxrstor(&vtl0->fx_state);
+ __mshv_vtl_return_call(vtl0);
+ fxsave(&vtl0->fx_state);
+ kernel_fpu_end();
+}
+EXPORT_SYMBOL(mshv_vtl_return_call);
diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c
new file mode 100644
index 000000000000..882c1db6df16
--- /dev/null
+++ b/arch/x86/hyperv/mshv-asm-offsets.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ * Naman Jain <namjain@microsoft.com>
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include <asm/mshyperv.h>
+
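+/*
+ * The Makefile rule added in this series turns the OFFSET() entries below
+ * into arch/x86/hyperv/mshv-asm-offsets.h for use by mshv_vtl_asm.S. The
+ * generated header boils down to plain defines, roughly (values follow the
+ * mshv_vtl_cpu_context layout in asm/mshyperv.h and are shown here only for
+ * illustration; the build derives them):
+ *
+ *   #define MSHV_VTL_CPU_CONTEXT_rax 0
+ *   #define MSHV_VTL_CPU_CONTEXT_rcx 8
+ *   #define MSHV_VTL_CPU_CONTEXT_rdx 16
+ *   #define MSHV_VTL_CPU_CONTEXT_rbx 24
+ *   #define MSHV_VTL_CPU_CONTEXT_cr2 32
+ *   ...
+ */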
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) {
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r8, mshv_vtl_cpu_context, r8);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r9, mshv_vtl_cpu_context, r9);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2);
+ }
+}
diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S
new file mode 100644
index 000000000000..f595eefad9ab
--- /dev/null
+++ b/arch/x86/hyperv/mshv_vtl_asm.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Assembly level code for mshv_vtl VTL transition
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ * Naman Jain <namjain@microsoft.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/static_call_types.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+#include "mshv-asm-offsets.h"
+
+ .text
+ .section .noinstr.text, "ax"
+/*
+ * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+ *
+ * This function is used to context switch between different Virtual Trust Levels.
+ * It is marked as 'noinstr' to prevent against instrumentation and debugging facilities.
+ * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard
+ * against #PFs in NMI context clobbering the guest state.
+ */
+SYM_FUNC_START(__mshv_vtl_return_call)
+ /* Push callee save registers */
+ pushq %rbp
+ mov %rsp, %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+
+ /* register switch to VTL0 clobbers all registers except rax/rcx */
+ mov %_ASM_ARG1, %rax
+
+ /* grab rbx/rbp/rsi/rdi/r8-r15 */
+ mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx
+ mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp
+ mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi
+ mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi
+ mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8
+ mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9
+ mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10
+ mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11
+ mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12
+ mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13
+ mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14
+ mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15
+
+ mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx
+ mov %rdx, %cr2
+ mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx
+
+ /* stash host registers on stack */
+ pushq %rax
+ pushq %rcx
+
+ xor %ecx, %ecx
+
+ /* make a hypercall to switch VTL */
+ call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall)
+
+ /* stash guest registers on stack, restore saved host copies */
+ pushq %rax
+ pushq %rcx
+ mov 16(%rsp), %rcx
+ mov 24(%rsp), %rax
+
+ mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax)
+ mov %cr2, %rdx
+ mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax)
+ pop MSHV_VTL_CPU_CONTEXT_rcx(%rax)
+ pop MSHV_VTL_CPU_CONTEXT_rax(%rax)
+ add $16, %rsp
+
+ /* save rbx/rbp/rsi/rdi/r8-r15 */
+ mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax)
+ mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax)
+ mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax)
+ mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax)
+ mov %r8, MSHV_VTL_CPU_CONTEXT_r8(%rax)
+ mov %r9, MSHV_VTL_CPU_CONTEXT_r9(%rax)
+ mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax)
+ mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax)
+ mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax)
+ mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax)
+ mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax)
+ mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax)
+
+ /* pop callee-save registers r12-r15, rbx */
+ pop %rbx
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ pop %rbp
+ RET
+SYM_FUNC_END(__mshv_vtl_return_call)
+/*
+ * Make sure that static_call_key symbol: __SCK____mshv_vtl_return_hypercall is accessible here.
+ * Below code is inspired from __ADDRESSABLE(sym) macro. Symbol name is kept simple, to avoid
+ * naming it something like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0"
+ * which would otherwise have been generated by the macro.
+ */
+ .section .discard.addressable,"aw"
+ .align 8
+ .type mshv_vtl_return_sym, @object
+ .size mshv_vtl_return_sym, 8
+mshv_vtl_return_sym:
+ .quad __SCK____mshv_vtl_return_hypercall
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 605abd02158d..eef4c3a5ba28 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,6 +11,7 @@
#include <asm/paravirt.h>
#include <asm/msr.h>
#include <hyperv/hvhdk.h>
+#include <asm/fpu/types.h>
/*
* Hyper-V always provides a single IO-APIC at this MMIO address.
@@ -176,6 +177,8 @@ int hyperv_flush_guest_mapping_range(u64 as,
int hyperv_fill_flush_guest_mapping_list(
struct hv_guest_mapping_flush_list *flush,
u64 start_gfn, u64 end_gfn);
+void hv_sleep_notifiers_register(void);
+void hv_machine_power_off(void);
#ifdef CONFIG_X86_64
void hv_apic_init(void);
@@ -237,6 +240,15 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg)
}
int hv_apicid_to_vp_index(u32 apic_id);
+#if IS_ENABLED(CONFIG_MSHV_ROOT) && IS_ENABLED(CONFIG_CRASH_DUMP)
+void hv_root_crash_init(void);
+void hv_crash_asm32(void);
+void hv_crash_asm64(void);
+void hv_crash_asm_end(void);
+#else /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
+static inline void hv_root_crash_init(void) {}
+#endif /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
+
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
@@ -260,13 +272,46 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
#endif /* CONFIG_HYPERV */
+struct mshv_vtl_cpu_context {
+ union {
+ struct {
+ u64 rax;
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 cr2;
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ };
+ u64 gp_regs[16];
+ };
+
+ struct fxregs_state fx_state;
+};
#ifdef CONFIG_HYPERV_VTL_MODE
void __init hv_vtl_init_platform(void);
int __init hv_vtl_early_init(void);
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
+void mshv_vtl_return_call_init(u64 vtl_return_offset);
+void mshv_vtl_return_hypercall(void);
+void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
#else
static inline void __init hv_vtl_init_platform(void) {}
static inline int __init hv_vtl_early_init(void) { return 0; }
+static inline void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
+static inline void mshv_vtl_return_call_init(u64 vtl_return_offset) {}
+static inline void mshv_vtl_return_hypercall(void) {}
+static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
#endif
#include <asm-generic/mshyperv.h>
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c4febdbcfe4d..579fb2c64cfd 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -28,9 +28,9 @@
#include <asm/apic.h>
#include <asm/timer.h>
#include <asm/reboot.h>
+#include <asm/msr.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
-#include <asm/msr.h>
#include <asm/numa.h>
#include <asm/svm.h>
@@ -39,6 +39,12 @@ bool hv_nested;
struct ms_hyperv_info ms_hyperv;
#if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * When running with the paravisor, controls proxying the synthetic interrupts
+ * from the host
+ */
+static bool hv_para_sint_proxy;
+
static inline unsigned int hv_get_nested_msr(unsigned int reg)
{
if (hv_is_sint_msr(reg))
@@ -75,17 +81,51 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
void hv_set_non_nested_msr(unsigned int reg, u64 value)
{
if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+ /* The hypervisor will get the intercept. */
hv_ivm_msr_write(reg, value);
- /* Write proxy bit via wrmsl instruction */
- if (hv_is_sint_msr(reg))
- wrmsrq(reg, value | 1 << 20);
+ /* Using wrmsrq so the following goes to the paravisor. */
+ if (hv_is_sint_msr(reg)) {
+ union hv_synic_sint sint = { .as_uint64 = value };
+
+ sint.proxy = hv_para_sint_proxy;
+ native_wrmsrq(reg, sint.as_uint64);
+ }
} else {
- wrmsrq(reg, value);
+ native_wrmsrq(reg, value);
}
}
EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
+/*
+ * Enable or disable proxying synthetic interrupts
+ * to the paravisor.
+ */
+void hv_para_set_sint_proxy(bool enable)
+{
+ hv_para_sint_proxy = enable;
+}
+
+/*
+ * Get the SynIC register value from the paravisor.
+ */
+u64 hv_para_get_synic_register(unsigned int reg)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return ~0ULL;
+ return native_read_msr(reg);
+}
+
+/*
+ * Set the SynIC register value with the paravisor.
+ */
+void hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return;
+ native_write_msr(reg, val);
+}
+
u64 hv_get_msr(unsigned int reg)
{
if (hv_nested)
@@ -215,7 +255,7 @@ static void hv_machine_shutdown(void)
#endif /* CONFIG_KEXEC_CORE */
#ifdef CONFIG_CRASH_DUMP
-static void hv_machine_crash_shutdown(struct pt_regs *regs)
+static void hv_guest_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
hv_crash_handler(regs);
@@ -440,7 +480,7 @@ EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
- int hv_max_functions_eax;
+ int hv_max_functions_eax, eax;
#ifdef CONFIG_PARAVIRT
pv_info.name = "Hyper-V";
@@ -470,11 +510,27 @@ static void __init ms_hyperv_init_platform(void)
hv_identify_partition_type();
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED;
+
if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
hv_nested = true;
pr_info("Hyper-V: running on a nested hypervisor\n");
}
+ /*
+ * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID
+ * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2
+ * root) will not get them because the nested (L1) hypervisor filters them out.
+ * These are handled through intercept processing by the Windows Hyper-V stack
+ * or the paravisor.
+ */
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+ ms_hyperv.confidential_vmbus_available =
+ eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
+ ms_hyperv.msi_ext_dest_id =
+ eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
@@ -565,11 +621,14 @@ static void __init ms_hyperv_init_platform(void)
#endif
#if IS_ENABLED(CONFIG_HYPERV)
+ if (hv_root_partition())
+ machine_ops.power_off = hv_machine_power_off;
#if defined(CONFIG_KEXEC_CORE)
machine_ops.shutdown = hv_machine_shutdown;
#endif
#if defined(CONFIG_CRASH_DUMP)
- machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+ if (!hv_root_partition())
+ machine_ops.crash_shutdown = hv_guest_crash_shutdown;
#endif
#endif
/*
@@ -675,21 +734,10 @@ static bool __init ms_hyperv_x2apic_available(void)
* pci-hyperv host bridge.
*
* Note: for a Hyper-V root partition, this will always return false.
- * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
- * default, they are implemented as intercepts by the Windows Hyper-V stack.
- * Even a nested root partition (L2 root) will not get them because the
- * nested (L1) hypervisor filters them out.
*/
static bool __init ms_hyperv_msi_ext_dest_id(void)
{
- u32 eax;
-
- eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE);
- if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE)
- return false;
-
- eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
- return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+ return ms_hyperv.msi_ext_dest_id;
}
#ifdef CONFIG_AMD_MEM_ENCRYPT
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 0b8c391a0342..7937ac0cbd0f 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -17,7 +17,8 @@ config HYPERV
config HYPERV_VTL_MODE
bool "Enable Linux to boot in VTL context"
- depends on (X86_64 || ARM64) && HYPERV
+ depends on (X86_64 && HAVE_STATIC_CALL) || ARM64
+ depends on HYPERV
depends on SMP
default n
help
@@ -75,6 +76,8 @@ config MSHV_ROOT
depends on PAGE_SIZE_4KB
select EVENTFD
select VIRT_XFER_TO_GUEST_WORK
+ select HMM_MIRROR
+ select MMU_NOTIFIER
default n
help
Select this option to enable support for booting and running as root
@@ -82,4 +85,28 @@ config MSHV_ROOT
If unsure, say N.
+config MSHV_VTL
+ tristate "Microsoft Hyper-V VTL driver"
+ depends on X86_64 && HYPERV_VTL_MODE
+ depends on HYPERV_VMBUS
+ # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL.
+ # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs,
+	# especially with large memory requirements.
+ depends on TRANSPARENT_HUGEPAGE
+ # MTRRs are controlled by VTL0, and are not specific to individual VTLs.
+ # Therefore, do not attempt to access or modify MTRRs here.
+ depends on !MTRR
+ select CPUMASK_OFFSTACK
+ select VIRT_XFER_TO_GUEST_WORK
+ default n
+ help
+ Select this option to enable Hyper-V VTL driver support.
+ This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2
+ userspace to create VTLs and partitions, setup and manage VTL0 memory and
+	  allow userspace to make direct hypercalls. This also allows mapping VTL0's address
+	  space into a usermode process in VTL2 and supports getting new VMBus messages and channel
+ events in VTL2.
+
+ If unsure, say N.
+
endmenu
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 1a1677bf4dac..a49f93c2d245 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
obj-$(CONFIG_MSHV_ROOT) += mshv_root.o
+obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o
CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)
@@ -13,8 +14,12 @@ hv_vmbus-y := vmbus_drv.o \
hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
- mshv_root_hv_call.o mshv_portid_table.o
+ mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
+mshv_vtl-y := mshv_vtl_main.o
# Code that must be built-in
obj-$(CONFIG_HYPERV) += hv_common.o
-obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
+obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o
+ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),)
+ obj-y += mshv_common.o
+endif
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 162d6aeece7b..6821f225248b 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -410,6 +410,21 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer,
return 0;
}
+static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo)
+{
+ struct vmbus_channel_msginfo *submsginfo, *tmp;
+
+ if (!msginfo)
+ return;
+
+ list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist,
+ msglistentry) {
+ kfree(submsginfo);
+ }
+
+ kfree(msginfo);
+}
+
/*
* __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer
*
@@ -429,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
struct vmbus_channel_gpadl_header *gpadlmsg;
struct vmbus_channel_gpadl_body *gpadl_body;
struct vmbus_channel_msginfo *msginfo = NULL;
- struct vmbus_channel_msginfo *submsginfo, *tmp;
+ struct vmbus_channel_msginfo *submsginfo;
struct list_head *curr;
u32 next_gpadl_handle;
unsigned long flags;
@@ -444,20 +459,24 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
return ret;
}
- /*
- * Set the "decrypted" flag to true for the set_memory_decrypted()
- * success case. In the failure case, the encryption state of the
- * memory is unknown. Leave "decrypted" as true to ensure the
- * memory will be leaked instead of going back on the free list.
- */
- gpadl->decrypted = true;
- ret = set_memory_decrypted((unsigned long)kbuffer,
- PFN_UP(size));
- if (ret) {
- dev_warn(&channel->device_obj->device,
- "Failed to set host visibility for new GPADL %d.\n",
- ret);
- return ret;
+ gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) ||
+ (channel->co_ring_buffer && type == HV_GPADL_RING));
+ if (gpadl->decrypted) {
+ /*
+ * The "decrypted" flag being true assumes that set_memory_decrypted() succeeds.
+ * But if it fails, the encryption state of the memory is unknown. In that case,
+ * leave "decrypted" as true to ensure the memory is leaked instead of going back
+ * on the free list.
+ */
+ ret = set_memory_decrypted((unsigned long)kbuffer,
+ PFN_UP(size));
+ if (ret) {
+ dev_warn(&channel->device_obj->device,
+ "Failed to set host visibility for new GPADL %d.\n",
+ ret);
+ vmbus_free_channel_msginfo(msginfo);
+ return ret;
+ }
}
init_completion(&msginfo->waitevent);
@@ -532,12 +551,8 @@ cleanup:
spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
list_del(&msginfo->msglistentry);
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
- list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist,
- msglistentry) {
- kfree(submsginfo);
- }
- kfree(msginfo);
+ vmbus_free_channel_msginfo(msginfo);
if (ret) {
/*
@@ -545,8 +560,10 @@ cleanup:
* left as true so the memory is leaked instead of being
* put back on the free list.
*/
- if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size)))
- gpadl->decrypted = false;
+ if (gpadl->decrypted) {
+ if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size)))
+ gpadl->decrypted = false;
+ }
}
return ret;
@@ -573,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl);
* keeps track of the next available slot in the array. Initially, each
* slot points to the next one (as in a Linked List). The last slot
* does not point to anything, so its value is U64_MAX by default.
- * @size The size of the array
+ * @size: The size of the array
*/
static u64 *request_arr_init(u32 size)
{
@@ -677,12 +694,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
goto error_clean_ring;
err = hv_ringbuffer_init(&newchannel->outbound,
- page, send_pages, 0);
+ page, send_pages, 0, newchannel->co_ring_buffer);
if (err)
goto error_free_gpadl;
err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages],
- recv_pages, newchannel->max_pkt_size);
+ recv_pages, newchannel->max_pkt_size,
+ newchannel->co_ring_buffer);
if (err)
goto error_free_gpadl;
@@ -863,8 +881,11 @@ post_msg_err:
kfree(info);
- ret = set_memory_encrypted((unsigned long)gpadl->buffer,
- PFN_UP(gpadl->size));
+ if (gpadl->decrypted)
+ ret = set_memory_encrypted((unsigned long)gpadl->buffer,
+ PFN_UP(gpadl->size));
+ else
+ ret = 0;
if (ret)
pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret);
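
The checks added to channel.c above all hinge on whether a given GPADL still
needs its backing memory made host-visible. As a reading aid, the new condition
in __vmbus_establish_gpadl() is equivalent to the following helper; the name
gpadl_needs_host_visibility() is made up here for illustration and is not part
of the patch:

/*
 * Sketch only: on a Confidential VMBus channel the ring buffer (and, when
 * offered, externally provided memory) stays encrypted, so no host-visibility
 * change is needed for it.
 */
static bool gpadl_needs_host_visibility(const struct vmbus_channel *channel,
                                        enum hv_gpadl_type type)
{
        return !((channel->co_external_memory && type == HV_GPADL_BUFFER) ||
                 (channel->co_ring_buffer && type == HV_GPADL_RING));
}

Keeping the ring buffer encrypted is the point of Confidential VMBus: channel
data is exchanged with the paravisor rather than exposed to the host.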
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 65dd299e2944..74fed2c073d4 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -844,14 +844,14 @@ static void vmbus_wait_for_unload(void)
= per_cpu_ptr(hv_context.cpu_context, cpu);
/*
- * In a CoCo VM the synic_message_page is not allocated
+ * In a CoCo VM the hyp_synic_message_page is not allocated
* in hv_synic_alloc(). Instead it is set/cleared in
- * hv_synic_enable_regs() and hv_synic_disable_regs()
+ * hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs()
* such that it is set only when the CPU is online. If
* not all present CPUs are online, the message page
* might be NULL, so skip such CPUs.
*/
- page_addr = hv_cpu->synic_message_page;
+ page_addr = hv_cpu->hyp_synic_message_page;
if (!page_addr)
continue;
@@ -892,7 +892,7 @@ completed:
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
- page_addr = hv_cpu->synic_message_page;
+ page_addr = hv_cpu->hyp_synic_message_page;
if (!page_addr)
continue;
@@ -1022,6 +1022,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
struct vmbus_channel_offer_channel *offer;
struct vmbus_channel *oldchannel, *newchannel;
size_t offer_sz;
+ bool co_ring_buffer, co_external_memory;
offer = (struct vmbus_channel_offer_channel *)hdr;
@@ -1034,6 +1035,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
return;
}
+ co_ring_buffer = is_co_ring_buffer(offer);
+ co_external_memory = is_co_external_memory(offer);
+ if (!co_ring_buffer && co_external_memory) {
+ pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n",
+ offer->child_relid);
+ return;
+ }
+ if (co_ring_buffer || co_external_memory) {
+ if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) {
+ pr_err("Invalid offer relid=%d: no support for confidential VMBus\n",
+ offer->child_relid);
+ atomic_dec(&vmbus_connection.offer_in_progress);
+ return;
+ }
+ }
+
oldchannel = find_primary_channel_by_offer(offer);
if (oldchannel != NULL) {
@@ -1112,6 +1129,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
pr_err("Unable to allocate channel object\n");
return;
}
+ newchannel->co_ring_buffer = co_ring_buffer;
+ newchannel->co_external_memory = co_external_memory;
vmbus_setup_channel_state(newchannel, offer);
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 1fe3573ae52a..5d9cb5bf2d62 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
* Linux guests and are not listed.
*/
static __u32 vmbus_versions[] = {
+ VERSION_WIN10_V6_0,
VERSION_WIN10_V5_3,
VERSION_WIN10_V5_2,
VERSION_WIN10_V5_1,
@@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = {
* Maximal VMBus protocol version guests can negotiate. Useful to cap the
* VMBus version for testing and debugging purpose.
*/
-static uint max_version = VERSION_WIN10_V5_3;
+static uint max_version = VERSION_WIN10_V6_0;
module_param(max_version, uint, S_IRUGO);
MODULE_PARM_DESC(max_version,
@@ -105,6 +106,9 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID;
}
+ if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0)
+ msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS;
+
/*
* shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always
* bitwise OR it
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index b14c5f9e0ef2..c100f04b3581 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -18,6 +18,7 @@
#include <linux/clockchips.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
+#include <linux/export.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include <linux/set_memory.h>
@@ -25,6 +26,7 @@
/* The one and only */
struct hv_context hv_context;
+EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl");
/*
* hv_init - Main initialization routine.
@@ -74,7 +76,11 @@ int hv_post_message(union hv_connection_id connection_id,
aligned_msg->payload_size = payload_size;
memcpy((void *)aligned_msg->payload, payload, payload_size);
- if (ms_hyperv.paravisor_present) {
+ if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) {
+ /*
+ * If the VMBus isn't confidential, use the CoCo-specific
+ * mechanism to communicate with the hypervisor.
+ */
if (hv_isolation_type_tdx())
status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
virt_to_phys(aligned_msg), 0);
@@ -88,6 +94,11 @@ int hv_post_message(union hv_connection_id connection_id,
u64 control = HVCALL_POST_MESSAGE;
control |= hv_nested ? HV_HYPERCALL_NESTED : 0;
+ /*
+ * If there is no paravisor, this will go to the hypervisor.
+ * In the Confidential VMBus case, the paravisor is present and
+ * this call traps to it instead.
+ */
status = hv_do_hypercall(control, aligned_msg, NULL);
}
@@ -95,11 +106,72 @@ int hv_post_message(union hv_connection_id connection_id,
return hv_result(status);
}
+EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl");
+
+static int hv_alloc_page(void **page, bool decrypt, const char *note)
+{
+ int ret = 0;
+
+ /*
+ * After the page changes its encryption status, its contents might
+ * appear scrambled on some hardware, so zeroing it with get_zeroed_page()
+ * before the conversion would be wasted work. Zero the page explicitly
+ * exactly once, after any conversion.
+ *
+ * By default, the page is allocated encrypted in a CoCo VM.
+ */
+ *page = (void *)__get_free_page(GFP_KERNEL);
+ if (!*page)
+ return -ENOMEM;
+
+ if (decrypt)
+ ret = set_memory_decrypted((unsigned long)*page, 1);
+ if (ret)
+ goto failed;
+
+ memset(*page, 0, PAGE_SIZE);
+ return 0;
+
+failed:
+ /*
+ * Report the failure but don't put the page back on the free list as
+ * its encryption status is unknown.
+ */
+ pr_err("allocation failed for %s page, error %d, decrypted %d\n",
+ note, ret, decrypt);
+ *page = NULL;
+ return ret;
+}
+
+static int hv_free_page(void **page, bool encrypt, const char *note)
+{
+ int ret = 0;
+
+ if (!*page)
+ return 0;
+
+ if (encrypt)
+ ret = set_memory_encrypted((unsigned long)*page, 1);
+
+ /*
+ * If re-encryption fails, the page is leaked: something is already wrong,
+ * so prefer losing a page of unknown encryption status over corrupting
+ * the free list.
+ */
+ if (ret)
+ pr_err("deallocation failed for %s page, error %d, encrypt %d\n",
+ note, ret, encrypt);
+ else
+ free_page((unsigned long)*page);
+
+ *page = NULL;
+
+ return ret;
+}
int hv_synic_alloc(void)
{
int cpu, ret = -ENOMEM;
struct hv_per_cpu_context *hv_cpu;
+ const bool decrypt = !vmbus_is_confidential();
/*
* First, zero all per-cpu memory areas so hv_synic_free() can
@@ -125,73 +197,37 @@ int hv_synic_alloc(void)
vmbus_on_msg_dpc, (unsigned long)hv_cpu);
if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
- hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->post_msg_page) {
- pr_err("Unable to allocate post msg page\n");
+ ret = hv_alloc_page(&hv_cpu->post_msg_page,
+ decrypt, "post msg");
+ if (ret)
goto err;
- }
-
- ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
- if (ret) {
- pr_err("Failed to decrypt post msg page: %d\n", ret);
- /* Just leak the page, as it's unsafe to free the page. */
- hv_cpu->post_msg_page = NULL;
- goto err;
- }
-
- memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
}
/*
- * Synic message and event pages are allocated by paravisor.
- * Skip these pages allocation here.
+ * If these SynIC pages are not allocated, SIEF and SIM pages
+ * are configured using what the root partition or the paravisor
+ * provides upon reading the SIEFP and SIMP registers.
*/
if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
- hv_cpu->synic_message_page =
- (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->synic_message_page) {
- pr_err("Unable to allocate SYNIC message page\n");
+ ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page,
+ decrypt, "hypervisor SynIC msg");
+ if (ret)
goto err;
- }
-
- hv_cpu->synic_event_page =
- (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->synic_event_page) {
- pr_err("Unable to allocate SYNIC event page\n");
-
- free_page((unsigned long)hv_cpu->synic_message_page);
- hv_cpu->synic_message_page = NULL;
+ ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page,
+ decrypt, "hypervisor SynIC event");
+ if (ret)
goto err;
- }
}
- if (!ms_hyperv.paravisor_present &&
- (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
- ret = set_memory_decrypted((unsigned long)
- hv_cpu->synic_message_page, 1);
- if (ret) {
- pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
- hv_cpu->synic_message_page = NULL;
-
- /*
- * Free the event page here so that hv_synic_free()
- * won't later try to re-encrypt it.
- */
- free_page((unsigned long)hv_cpu->synic_event_page);
- hv_cpu->synic_event_page = NULL;
+ if (vmbus_is_confidential()) {
+ ret = hv_alloc_page(&hv_cpu->para_synic_message_page,
+ false, "paravisor SynIC msg");
+ if (ret)
goto err;
- }
-
- ret = set_memory_decrypted((unsigned long)
- hv_cpu->synic_event_page, 1);
- if (ret) {
- pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
- hv_cpu->synic_event_page = NULL;
+ ret = hv_alloc_page(&hv_cpu->para_synic_event_page,
+ false, "paravisor SynIC event");
+ if (ret)
goto err;
- }
-
- memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
- memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
}
}
@@ -207,70 +243,46 @@ err:
void hv_synic_free(void)
{
- int cpu, ret;
+ int cpu;
+ const bool encrypt = !vmbus_is_confidential();
for_each_present_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
- /* It's better to leak the page if the encryption fails. */
- if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
- if (hv_cpu->post_msg_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->post_msg_page, 1);
- if (ret) {
- pr_err("Failed to encrypt post msg page: %d\n", ret);
- hv_cpu->post_msg_page = NULL;
- }
- }
+ if (ms_hyperv.paravisor_present && hv_isolation_type_tdx())
+ hv_free_page(&hv_cpu->post_msg_page,
+ encrypt, "post msg");
+ if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
+ hv_free_page(&hv_cpu->hyp_synic_event_page,
+ encrypt, "hypervisor SynIC event");
+ hv_free_page(&hv_cpu->hyp_synic_message_page,
+ encrypt, "hypervisor SynIC msg");
}
-
- if (!ms_hyperv.paravisor_present &&
- (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
- if (hv_cpu->synic_message_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->synic_message_page, 1);
- if (ret) {
- pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
- hv_cpu->synic_message_page = NULL;
- }
- }
-
- if (hv_cpu->synic_event_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->synic_event_page, 1);
- if (ret) {
- pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
- hv_cpu->synic_event_page = NULL;
- }
- }
+ if (vmbus_is_confidential()) {
+ hv_free_page(&hv_cpu->para_synic_event_page,
+ false, "paravisor SynIC event");
+ hv_free_page(&hv_cpu->para_synic_message_page,
+ false, "paravisor SynIC msg");
}
-
- free_page((unsigned long)hv_cpu->post_msg_page);
- free_page((unsigned long)hv_cpu->synic_event_page);
- free_page((unsigned long)hv_cpu->synic_message_page);
}
kfree(hv_context.hv_numa_map);
}
/*
- * hv_synic_init - Initialize the Synthetic Interrupt Controller.
- *
- * If it is already initialized by another entity (ie x2v shim), we need to
- * retrieve the initialized message and event pages. Otherwise, we create and
- * initialize the message and event pages.
+ * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller
+ * with the hypervisor.
*/
-void hv_synic_enable_regs(unsigned int cpu)
+void hv_hyp_synic_enable_regs(unsigned int cpu)
{
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sint shared_sint;
- union hv_synic_scontrol sctrl;
- /* Setup the Synic's message page */
+ /* Setup the Synic's message page with the hypervisor. */
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
simp.simp_enabled = 1;
@@ -278,18 +290,18 @@ void hv_synic_enable_regs(unsigned int cpu)
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
- hv_cpu->synic_message_page =
+ hv_cpu->hyp_synic_message_page =
(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
- if (!hv_cpu->synic_message_page)
+ if (!hv_cpu->hyp_synic_message_page)
pr_err("Fail to map synic message page.\n");
} else {
- simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
+ simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page)
>> HV_HYP_PAGE_SHIFT;
}
hv_set_msr(HV_MSR_SIMP, simp.as_uint64);
- /* Setup the Synic's event page */
+ /* Setup the Synic's event page with the hypervisor. */
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 1;
@@ -297,16 +309,17 @@ void hv_synic_enable_regs(unsigned int cpu)
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
- hv_cpu->synic_event_page =
+ hv_cpu->hyp_synic_event_page =
(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
- if (!hv_cpu->synic_event_page)
+ if (!hv_cpu->hyp_synic_event_page)
pr_err("Fail to map synic event page.\n");
} else {
- siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
+ siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page)
>> HV_HYP_PAGE_SHIFT;
}
hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
+ hv_enable_coco_interrupt(cpu, vmbus_interrupt, true);
/* Setup the shared SINT. */
if (vmbus_irq != -1)
@@ -317,6 +330,11 @@ void hv_synic_enable_regs(unsigned int cpu)
shared_sint.masked = false;
shared_sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+}
+
+static void hv_hyp_synic_enable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
/* Enable the global synic bit */
sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
@@ -325,23 +343,72 @@ void hv_synic_enable_regs(unsigned int cpu)
hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
}
+static void hv_para_synic_enable_regs(unsigned int cpu)
+{
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+ struct hv_per_cpu_context *hv_cpu
+ = per_cpu_ptr(hv_context.cpu_context, cpu);
+
+ /* Setup the Synic's message page with the paravisor. */
+ simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP);
+ simp.simp_enabled = 1;
+ simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page)
+ >> HV_HYP_PAGE_SHIFT;
+ hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64);
+
+ /* Setup the Synic's event page with the paravisor. */
+ siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP);
+ siefp.siefp_enabled = 1;
+ siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page)
+ >> HV_HYP_PAGE_SHIFT;
+ hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_para_synic_enable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
+
+ /* Enable the global synic bit */
+ sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL);
+ sctrl.enable = 1;
+ hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64);
+}
+
int hv_synic_init(unsigned int cpu)
{
- hv_synic_enable_regs(cpu);
+ if (vmbus_is_confidential())
+ hv_para_synic_enable_regs(cpu);
+
+ /*
+ * The SINT is set in hv_hyp_synic_enable_regs() by calling
+ * hv_set_msr(). hv_set_msr() in turn has special-case code for the
+ * SINT MSRs that writes to both the hypervisor version of the MSR
+ * *and* the paravisor version of the MSR (but *without* the proxy bit when
+ * VMBus is confidential).
+ *
+ * Then enable interrupts via the paravisor if VMBus is confidential,
+ * and otherwise via the hypervisor.
+ */
+
+ hv_hyp_synic_enable_regs(cpu);
+ if (vmbus_is_confidential())
+ hv_para_synic_enable_interrupts();
+ else
+ hv_hyp_synic_enable_interrupts();
hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);
return 0;
}
-void hv_synic_disable_regs(unsigned int cpu)
+void hv_hyp_synic_disable_regs(unsigned int cpu)
{
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
union hv_synic_sint shared_sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
- union hv_synic_scontrol sctrl;
shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);
@@ -350,18 +417,21 @@ void hv_synic_disable_regs(unsigned int cpu)
/* Need to correctly cleanup in the case of SMP!!! */
/* Disable the interrupt */
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+ hv_enable_coco_interrupt(cpu, vmbus_interrupt, false);
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
/*
- * In Isolation VM, sim and sief pages are allocated by
+ * In Isolation VM, simp and sief pages are allocated by
* paravisor. These pages also will be used by kdump
* kernel. So just reset enable bit here and keep page
* addresses.
*/
simp.simp_enabled = 0;
if (ms_hyperv.paravisor_present || hv_root_partition()) {
- iounmap(hv_cpu->synic_message_page);
- hv_cpu->synic_message_page = NULL;
+ if (hv_cpu->hyp_synic_message_page) {
+ iounmap(hv_cpu->hyp_synic_message_page);
+ hv_cpu->hyp_synic_message_page = NULL;
+ }
} else {
simp.base_simp_gpa = 0;
}
@@ -372,21 +442,51 @@ void hv_synic_disable_regs(unsigned int cpu)
siefp.siefp_enabled = 0;
if (ms_hyperv.paravisor_present || hv_root_partition()) {
- iounmap(hv_cpu->synic_event_page);
- hv_cpu->synic_event_page = NULL;
+ if (hv_cpu->hyp_synic_event_page) {
+ iounmap(hv_cpu->hyp_synic_event_page);
+ hv_cpu->hyp_synic_event_page = NULL;
+ }
} else {
siefp.base_siefp_gpa = 0;
}
hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_hyp_synic_disable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
/* Disable the global synic bit */
sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
sctrl.enable = 0;
hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
+}
- if (vmbus_irq != -1)
- disable_percpu_irq(vmbus_irq);
+static void hv_para_synic_disable_regs(unsigned int cpu)
+{
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+
+ /* Disable SynIC's message page in the paravisor. */
+ simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP);
+ simp.simp_enabled = 0;
+ hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64);
+
+ /* Disable SynIC's event page in the paravisor. */
+ siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP);
+ siefp.siefp_enabled = 0;
+ hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_para_synic_disable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
+
+ /* Disable the global synic bit */
+ sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL);
+ sctrl.enable = 0;
+ hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64);
}
#define HV_MAX_TRIES 3
@@ -399,16 +499,18 @@ void hv_synic_disable_regs(unsigned int cpu)
* that the normal interrupt handling mechanism will find and process the channel interrupt
* "very soon", and in the process clear the bit.
*/
-static bool hv_synic_event_pending(void)
+static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint)
{
- struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
- union hv_synic_event_flags *event =
- (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
- unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
+ unsigned long *recv_int_page;
bool pending;
u32 relid;
int tries = 0;
+ if (!event)
+ return false;
+
+ event += sint;
+ recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
retry:
pending = false;
for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
@@ -425,6 +527,17 @@ retry:
return pending;
}
+static bool hv_synic_event_pending(void)
+{
+ struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page;
+ union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page;
+
+ return
+ __hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) ||
+ __hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT);
+}
+
static int hv_pick_new_cpu(struct vmbus_channel *channel)
{
int ret = -EBUSY;
@@ -517,7 +630,27 @@ int hv_synic_cleanup(unsigned int cpu)
always_cleanup:
hv_stimer_legacy_cleanup(cpu);
- hv_synic_disable_regs(cpu);
+ /*
+ * First, disable the event and message pages
+ * used for communicating with the host, and then
+ * disable the host interrupts if VMBus is not
+ * confidential.
+ */
+ hv_hyp_synic_disable_regs(cpu);
+ if (!vmbus_is_confidential())
+ hv_hyp_synic_disable_interrupts();
+
+ /*
+ * Perform the same steps for the Confidential VMBus.
+ * The ordering guarantees that no data can be posted for
+ * processing after the interrupts have been disabled.
+ */
+ if (vmbus_is_confidential()) {
+ hv_para_synic_disable_regs(cpu);
+ hv_para_synic_disable_interrupts();
+ }
+ if (vmbus_irq != -1)
+ disable_percpu_irq(vmbus_irq);
return ret;
}
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index e109a620c83f..0a3ab7efed46 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -315,9 +315,9 @@ int __init hv_common_init(void)
int i;
union hv_hypervisor_version_info version;
- /* Get information about the Hyper-V host version */
+ /* Get information about the Microsoft Hypervisor version */
if (!hv_get_hypervisor_version(&version))
- pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
+ pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n",
version.major_version, version.minor_version,
version.build_number, version.service_number,
version.service_pack, version.service_branch);
@@ -487,7 +487,7 @@ int hv_common_cpu_init(unsigned int cpu)
* online and then taken offline
*/
if (!*inputarg) {
- mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
+ mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags);
if (!mem)
return -ENOMEM;
@@ -716,6 +716,27 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
}
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
+void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+}
+EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt);
+
+void __weak hv_para_set_sint_proxy(bool enable)
+{
+}
+EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy);
+
+u64 __weak hv_para_get_synic_register(unsigned int reg)
+{
+ return ~0ULL;
+}
+EXPORT_SYMBOL_GPL(hv_para_get_synic_register);
+
+void __weak hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+}
+EXPORT_SYMBOL_GPL(hv_para_set_synic_register);
+
void hv_identify_partition_type(void)
{
/* Assume guest role */
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 36ee89c0358b..7e9c8e169c66 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -586,7 +586,7 @@ static int util_probe(struct hv_device *dev,
(struct hv_util_service *)dev_id->driver_data;
int ret;
- srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL);
+ srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!srv->recv_buffer)
return -ENOMEM;
srv->channel = dev->channel;
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 0b450e53161e..b2862e0a317a 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/bitops.h>
#include <asm/sync_bitops.h>
+#include <asm/mshyperv.h>
#include <linux/atomic.h>
#include <linux/hyperv.h>
#include <linux/interrupt.h>
@@ -32,6 +33,7 @@
*/
#define HV_UTIL_NEGO_TIMEOUT 55
+void vmbus_isr(void);
/* Definitions for the monitored notification facility */
union hv_monitor_trigger_group {
@@ -120,8 +122,26 @@ enum {
* Per cpu state for channel handling
*/
struct hv_per_cpu_context {
- void *synic_message_page;
- void *synic_event_page;
+ /*
+ * SynIC pages for communicating with the host.
+ *
+ * These pages are accessible to the host partition and the hypervisor.
+ * They may be used for exchanging data with the host partition and the
+ * hypervisor even when those aren't trusted; the guest partition must
+ * be prepared to handle malicious behavior.
+ */
+ void *hyp_synic_message_page;
+ void *hyp_synic_event_page;
+ /*
+ * SynIC pages for communicating with the paravisor.
+ *
+ * These pages may be accessed from within the guest partition only in
+ * CoCo VMs. Neither the host partition nor the hypervisor can access
+ * these pages in that case; they are used for exchanging data with the
+ * paravisor.
+ */
+ void *para_synic_message_page;
+ void *para_synic_event_page;
/*
* The page is only used in hv_post_message() for a TDX VM (with the
@@ -171,10 +191,10 @@ extern int hv_synic_alloc(void);
extern void hv_synic_free(void);
-extern void hv_synic_enable_regs(unsigned int cpu);
+extern void hv_hyp_synic_enable_regs(unsigned int cpu);
extern int hv_synic_init(unsigned int cpu);
-extern void hv_synic_disable_regs(unsigned int cpu);
+extern void hv_hyp_synic_disable_regs(unsigned int cpu);
extern int hv_synic_cleanup(unsigned int cpu);
/* Interface */
@@ -182,7 +202,8 @@ extern int hv_synic_cleanup(unsigned int cpu);
void hv_ringbuffer_pre_init(struct vmbus_channel *channel);
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
- struct page *pages, u32 pagecnt, u32 max_pkt_size);
+ struct page *pages, u32 pagecnt, u32 max_pkt_size,
+ bool confidential);
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
@@ -333,6 +354,51 @@ extern const struct vmbus_channel_message_table_entry
/* General vmbus interface */
+bool vmbus_is_confidential(void);
+
+#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
+/* Free the message slot and signal end-of-message if required */
+static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
+{
+ /*
+ * On crash we're reading some other CPU's message page and we need
+ * to be careful: that CPU may have already cleared the header and the
+ * host may have already delivered some other message there. If we
+ * blindly write msg->header.message_type, we are going to lose it.
+ * We can still lose a message of the same type, but
+ * we count on the fact that there can only be one
+ * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
+ * on crash.
+ */
+ if (cmpxchg(&msg->header.message_type, old_msg_type,
+ HVMSG_NONE) != old_msg_type)
+ return;
+
+ /*
+ * The cmpxchg() above does an implicit memory barrier to ensure
+ * the write to MessageType (i.e. setting it to HVMSG_NONE) happens
+ * before we read MessagePending and write the EOM. Otherwise, the
+ * EOM will not deliver any more messages, since there is no
+ * empty slot.
+ */
+ if (msg->header.message_flags.msg_pending) {
+ /*
+ * This will cause a message queue rescan to possibly deliver
+ * another message from the hypervisor.
+ */
+ if (vmbus_is_confidential())
+ hv_para_set_synic_register(HV_MSR_EOM, 0);
+ else
+ hv_set_msr(HV_MSR_EOM, 0);
+ }
+}
+
+extern int vmbus_interrupt;
+extern int vmbus_irq;
+#endif /* CONFIG_HYPERV_VMBUS */
+
struct hv_device *vmbus_device_create(const guid_t *type,
const guid_t *instance,
struct vmbus_channel *channel);
diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c
index aa2be51979fd..58027b23c206 100644
--- a/drivers/hv/mshv_common.c
+++ b/drivers/hv/mshv_common.c
@@ -14,6 +14,9 @@
#include <asm/mshyperv.h>
#include <linux/resume_user_mode.h>
#include <linux/export.h>
+#include <linux/acpi.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
#include "mshv.h"
@@ -138,3 +141,99 @@ int hv_call_get_partition_property(u64 partition_id,
return 0;
}
EXPORT_SYMBOL_GPL(hv_call_get_partition_property);
+
+/*
+ * Corresponding sleep states have to be initialized in order for a subsequent
+ * HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only the S5 state (see
+ * ACPI 6.4, chapter 7.4.2) is relevant, though S1, S2 and S3 could be
+ * supported as well.
+ *
+ * To pass proper PM values to the MSHV hypervisor, ACPI should be initialized
+ * and should support the S5 sleep state when this function is invoked.
+ */
+static int hv_initialize_sleep_states(void)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_set_system_property *in;
+ acpi_status acpi_status;
+ u8 sleep_type_a, sleep_type_b;
+
+ if (!acpi_sleep_state_supported(ACPI_STATE_S5)) {
+ pr_err("%s: S5 sleep state not supported.\n", __func__);
+ return -ENODEV;
+ }
+
+ acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a,
+ &sleep_type_b);
+ if (ACPI_FAILURE(acpi_status))
+ return -ENODEV;
+
+ local_irq_save(flags);
+ in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(in, 0, sizeof(*in));
+
+ in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE;
+ in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5;
+ in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a;
+ in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b;
+
+ status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "\n");
+ return hv_result_to_errno(status);
+ }
+
+ return 0;
+}
+
+/*
+ * This notifier initializes the sleep states in the MSHV hypervisor; they
+ * are used during power off.
+ */
+static int hv_reboot_notifier_handler(struct notifier_block *this,
+ unsigned long code, void *another)
+{
+ int ret = 0;
+
+ if (code == SYS_HALT || code == SYS_POWER_OFF)
+ ret = hv_initialize_sleep_states();
+
+ return ret ? NOTIFY_DONE : NOTIFY_OK;
+}
+
+static struct notifier_block hv_reboot_notifier = {
+ .notifier_call = hv_reboot_notifier_handler,
+};
+
+void hv_sleep_notifiers_register(void)
+{
+ int ret;
+
+ ret = register_reboot_notifier(&hv_reboot_notifier);
+ if (ret)
+ pr_err("%s: cannot register reboot notifier %d\n", __func__,
+ ret);
+}
+
+/*
+ * Power off the machine by entering S5 sleep state via Hyper-V hypercall.
+ * This call does not return if successful.
+ */
+void hv_machine_power_off(void)
+{
+ unsigned long flags;
+ struct hv_input_enter_sleep_state *in;
+
+ local_irq_save(flags);
+ in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ in->sleep_state = HV_SLEEP_STATE_S5;
+
+ (void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL);
+ local_irq_restore(flags);
+
+ /* should never reach here */
+ BUG();
+}
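
hv_machine_power_off() and hv_sleep_notifiers_register() are meant to be called
from platform setup code; the call sites are outside this file. A minimal sketch
of that wiring, assuming the generic pm_power_off hook is used (the series may
instead go through arch-specific machine ops), with mshv_setup_power_off()
being a made-up name:

/* Hypothetical init-time wiring; not taken from this patch. */
static void __init mshv_setup_power_off(void)
{
        /* Have the reboot notifier program S5 sleep-state data into the hypervisor. */
        hv_sleep_notifiers_register();

        /* Route the final power-off through HVCALL_ENTER_SLEEP_STATE. */
        pm_power_off = hv_machine_power_off;
}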
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 806674722868..d93a18f09c76 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -163,8 +163,10 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
return -EOPNOTSUPP;
+#if IS_ENABLED(CONFIG_X86)
if (irq->lapic_control.logical_dest_mode)
return -EOPNOTSUPP;
+#endif
vp = partition->pt_vp_array[irq->lapic_apic_id];
@@ -196,8 +198,10 @@ static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
unsigned int seq;
int idx;
+#if IS_ENABLED(CONFIG_X86)
WARN_ON(irqfd->irqfd_resampler &&
!irq->lapic_control.level_triggered);
+#endif
idx = srcu_read_lock(&partition->pt_irq_srcu);
if (irqfd->irqfd_girq_ent.guest_irq_num) {
@@ -469,6 +473,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
spin_lock_irq(&pt->pt_irqfds_lock);
+#if IS_ENABLED(CONFIG_X86)
if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
!irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
/*
@@ -479,6 +484,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
ret = -EINVAL;
goto fail;
}
+#endif
ret = 0;
hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
@@ -592,7 +598,7 @@ static void mshv_irqfd_release(struct mshv_partition *pt)
int mshv_irqfd_wq_init(void)
{
- irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0);
+ irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0);
if (!irqfd_cleanup_wq)
return -ENOMEM;
diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c
index d0fb9ef734f4..798e7e1ab06e 100644
--- a/drivers/hv/mshv_irq.c
+++ b/drivers/hv/mshv_irq.c
@@ -119,6 +119,10 @@ void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent,
lirq->lapic_vector = ent->girq_irq_data & 0xFF;
lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF;
lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8;
+#if IS_ENABLED(CONFIG_X86)
lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1;
lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1;
+#elif IS_ENABLED(CONFIG_ARM64)
+ lirq->lapic_control.asserted = 1;
+#endif
}
diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
new file mode 100644
index 000000000000..202b9d551e39
--- /dev/null
+++ b/drivers/hv/mshv_regions.c
@@ -0,0 +1,555 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Memory region management for mshv_root module.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/hmm.h>
+#include <linux/hyperv.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include <asm/mshyperv.h>
+
+#include "mshv_root.h"
+
+#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD
+
+/**
+ * mshv_region_process_chunk - Processes a contiguous chunk of memory pages
+ * in a region.
+ * @region : Pointer to the memory region structure.
+ * @flags : Flags to pass to the handler.
+ * @page_offset: Offset into the region's pages array to start processing.
+ * @page_count : Number of pages to process.
+ * @handler : Callback function to handle the chunk.
+ *
+ * This function scans the region's pages starting from @page_offset,
+ * checking for contiguous present pages of the same size (normal or huge).
+ * It invokes @handler for the chunk of contiguous pages found. Returns the
+ * number of pages handled, or a negative error code if the first page is
+ * not present or the handler fails.
+ *
+ * Note: The @handler callback must be able to handle both normal and huge
+ * pages.
+ *
+ * Return: Number of pages handled, or negative error code.
+ */
+static long mshv_region_process_chunk(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ int (*handler)(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset,
+ u64 page_count))
+{
+ u64 count, stride;
+ unsigned int page_order;
+ struct page *page;
+ int ret;
+
+ page = region->pages[page_offset];
+ if (!page)
+ return -EINVAL;
+
+ page_order = folio_order(page_folio(page));
+ /* The hypervisor only supports 4K and 2M page sizes */
+ if (page_order && page_order != HPAGE_PMD_ORDER)
+ return -EINVAL;
+
+ stride = 1 << page_order;
+
+ /* Start at stride since the first page is validated */
+ for (count = stride; count < page_count; count += stride) {
+ page = region->pages[page_offset + count];
+
+ /* Break if current page is not present */
+ if (!page)
+ break;
+
+ /* Break if page size changes */
+ if (page_order != folio_order(page_folio(page)))
+ break;
+ }
+
+ ret = handler(region, flags, page_offset, count);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+/**
+ * mshv_region_process_range - Processes a range of memory pages in a
+ * region.
+ * @region : Pointer to the memory region structure.
+ * @flags : Flags to pass to the handler.
+ * @page_offset: Offset into the region's pages array to start processing.
+ * @page_count : Number of pages to process.
+ * @handler : Callback function to handle each chunk of contiguous
+ * pages.
+ *
+ * Iterates over the specified range of pages in @region, skipping
+ * non-present pages. For each contiguous chunk of present pages, invokes
+ * @handler via mshv_region_process_chunk.
+ *
+ * Note: The @handler callback must be able to handle both normal and huge
+ * pages.
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+static int mshv_region_process_range(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ int (*handler)(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset,
+ u64 page_count))
+{
+ long ret;
+
+ if (page_offset + page_count > region->nr_pages)
+ return -EINVAL;
+
+ while (page_count) {
+ /* Skip non-present pages */
+ if (!region->pages[page_offset]) {
+ page_offset++;
+ page_count--;
+ continue;
+ }
+
+ ret = mshv_region_process_chunk(region, flags,
+ page_offset,
+ page_count,
+ handler);
+ if (ret < 0)
+ return ret;
+
+ page_offset += ret;
+ page_count -= ret;
+ }
+
+ return 0;
+}
+
+struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
+ u64 uaddr, u32 flags)
+{
+ struct mshv_mem_region *region;
+
+ region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
+ if (!region)
+ return ERR_PTR(-ENOMEM);
+
+ region->nr_pages = nr_pages;
+ region->start_gfn = guest_pfn;
+ region->start_uaddr = uaddr;
+ region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
+ if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
+ region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
+ if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
+ region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
+
+ kref_init(&region->refcount);
+
+ return region;
+}
+
+static int mshv_region_chunk_share(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count)
+{
+ struct page *page = region->pages[page_offset];
+
+ if (PageHuge(page) || PageTransCompound(page))
+ flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+ return hv_call_modify_spa_host_access(region->partition->pt_id,
+ region->pages + page_offset,
+ page_count,
+ HV_MAP_GPA_READABLE |
+ HV_MAP_GPA_WRITABLE,
+ flags, true);
+}
+
+int mshv_region_share(struct mshv_mem_region *region)
+{
+ u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
+
+ return mshv_region_process_range(region, flags,
+ 0, region->nr_pages,
+ mshv_region_chunk_share);
+}
+
+static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count)
+{
+ struct page *page = region->pages[page_offset];
+
+ if (PageHuge(page) || PageTransCompound(page))
+ flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+ return hv_call_modify_spa_host_access(region->partition->pt_id,
+ region->pages + page_offset,
+ page_count, 0,
+ flags, false);
+}
+
+int mshv_region_unshare(struct mshv_mem_region *region)
+{
+ u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
+
+ return mshv_region_process_range(region, flags,
+ 0, region->nr_pages,
+ mshv_region_chunk_unshare);
+}
+
+static int mshv_region_chunk_remap(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count)
+{
+ struct page *page = region->pages[page_offset];
+
+ if (PageHuge(page) || PageTransCompound(page))
+ flags |= HV_MAP_GPA_LARGE_PAGE;
+
+ return hv_call_map_gpa_pages(region->partition->pt_id,
+ region->start_gfn + page_offset,
+ page_count, flags,
+ region->pages + page_offset);
+}
+
+static int mshv_region_remap_pages(struct mshv_mem_region *region,
+ u32 map_flags,
+ u64 page_offset, u64 page_count)
+{
+ return mshv_region_process_range(region, map_flags,
+ page_offset, page_count,
+ mshv_region_chunk_remap);
+}
+
+int mshv_region_map(struct mshv_mem_region *region)
+{
+ u32 map_flags = region->hv_map_flags;
+
+ return mshv_region_remap_pages(region, map_flags,
+ 0, region->nr_pages);
+}
+
+static void mshv_region_invalidate_pages(struct mshv_mem_region *region,
+ u64 page_offset, u64 page_count)
+{
+ if (region->type == MSHV_REGION_TYPE_MEM_PINNED)
+ unpin_user_pages(region->pages + page_offset, page_count);
+
+ memset(region->pages + page_offset, 0,
+ page_count * sizeof(struct page *));
+}
+
+void mshv_region_invalidate(struct mshv_mem_region *region)
+{
+ mshv_region_invalidate_pages(region, 0, region->nr_pages);
+}
+
+int mshv_region_pin(struct mshv_mem_region *region)
+{
+ u64 done_count, nr_pages;
+ struct page **pages;
+ __u64 userspace_addr;
+ int ret;
+
+ for (done_count = 0; done_count < region->nr_pages; done_count += ret) {
+ pages = region->pages + done_count;
+ userspace_addr = region->start_uaddr +
+ done_count * HV_HYP_PAGE_SIZE;
+ nr_pages = min(region->nr_pages - done_count,
+ MSHV_PIN_PAGES_BATCH_SIZE);
+
+ /*
+ * Pinning assuming 4k pages works for large pages too.
+ * All page structs within the large page are returned.
+ *
+ * Pin requests are batched because pin_user_pages_fast
+ * with the FOLL_LONGTERM flag does a large temporary
+ * allocation of contiguous memory.
+ */
+ ret = pin_user_pages_fast(userspace_addr, nr_pages,
+ FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ if (ret < 0)
+ goto release_pages;
+ }
+
+ return 0;
+
+release_pages:
+ mshv_region_invalidate_pages(region, 0, done_count);
+ return ret;
+}
+
+static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count)
+{
+ struct page *page = region->pages[page_offset];
+
+ if (PageHuge(page) || PageTransCompound(page))
+ flags |= HV_UNMAP_GPA_LARGE_PAGE;
+
+ return hv_call_unmap_gpa_pages(region->partition->pt_id,
+ region->start_gfn + page_offset,
+ page_count, flags);
+}
+
+static int mshv_region_unmap(struct mshv_mem_region *region)
+{
+ return mshv_region_process_range(region, 0,
+ 0, region->nr_pages,
+ mshv_region_chunk_unmap);
+}
+
+static void mshv_region_destroy(struct kref *ref)
+{
+ struct mshv_mem_region *region =
+ container_of(ref, struct mshv_mem_region, refcount);
+ struct mshv_partition *partition = region->partition;
+ int ret;
+
+ if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ mshv_region_movable_fini(region);
+
+ if (mshv_partition_encrypted(partition)) {
+ ret = mshv_region_share(region);
+ if (ret) {
+ pt_err(partition,
+ "Failed to regain access to memory, unpinning user pages will fail and crash the host, error: %d\n",
+ ret);
+ return;
+ }
+ }
+
+ mshv_region_unmap(region);
+
+ mshv_region_invalidate(region);
+
+ vfree(region);
+}
+
+void mshv_region_put(struct mshv_mem_region *region)
+{
+ kref_put(&region->refcount, mshv_region_destroy);
+}
+
+int mshv_region_get(struct mshv_mem_region *region)
+{
+ return kref_get_unless_zero(&region->refcount);
+}
+
+/**
+ * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region
+ * @region: Pointer to the memory region structure
+ * @range: Pointer to the HMM range structure
+ *
+ * This function performs the following steps:
+ * 1. Reads the notifier sequence for the HMM range.
+ * 2. Acquires a read lock on the memory map.
+ * 3. Handles HMM faults for the specified range.
+ * 4. Releases the read lock on the memory map.
+ * 5. If successful, locks the memory region mutex.
+ * 6. Verifies whether the notifier sequence changed during the operation.
+ *    If it did, releases the mutex and returns -EBUSY, matching the
+ *    hmm_range_fault() return code so that the caller retries.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
+ struct hmm_range *range)
+{
+ int ret;
+
+ range->notifier_seq = mmu_interval_read_begin(range->notifier);
+ mmap_read_lock(region->mni.mm);
+ ret = hmm_range_fault(range);
+ mmap_read_unlock(region->mni.mm);
+ if (ret)
+ return ret;
+
+ mutex_lock(&region->mutex);
+
+ if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
+ mutex_unlock(&region->mutex);
+ cond_resched();
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+/**
+ * mshv_region_range_fault - Handle memory range faults for a given region.
+ * @region: Pointer to the memory region structure.
+ * @page_offset: Offset of the page within the region.
+ * @page_count: Number of pages to handle.
+ *
+ * This function resolves memory faults for a specified range of pages
+ * within a memory region. It uses HMM (Heterogeneous Memory Management)
+ * to fault in the required pages and updates the region's page array.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int mshv_region_range_fault(struct mshv_mem_region *region,
+ u64 page_offset, u64 page_count)
+{
+ struct hmm_range range = {
+ .notifier = &region->mni,
+ .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
+ };
+ unsigned long *pfns;
+ int ret;
+ u64 i;
+
+ pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL);
+ if (!pfns)
+ return -ENOMEM;
+
+ range.hmm_pfns = pfns;
+ range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE;
+ range.end = range.start + page_count * HV_HYP_PAGE_SIZE;
+
+ do {
+ ret = mshv_region_hmm_fault_and_lock(region, &range);
+ } while (ret == -EBUSY);
+
+ if (ret)
+ goto out;
+
+ for (i = 0; i < page_count; i++)
+ region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
+
+ ret = mshv_region_remap_pages(region, region->hv_map_flags,
+ page_offset, page_count);
+
+ mutex_unlock(&region->mutex);
+out:
+ kfree(pfns);
+ return ret;
+}
+
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
+{
+ u64 page_offset, page_count;
+ int ret;
+
+ /* Align the page offset down to a multiple of MSHV_MAP_FAULT_IN_PAGES. */
+ page_offset = ALIGN_DOWN(gfn - region->start_gfn,
+ MSHV_MAP_FAULT_IN_PAGES);
+
+ /* Map more pages than requested to reduce the number of faults. */
+ page_count = min(region->nr_pages - page_offset,
+ MSHV_MAP_FAULT_IN_PAGES);
+
+ ret = mshv_region_range_fault(region, page_offset, page_count);
+
+ WARN_ONCE(ret,
+ "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n",
+ region->partition->pt_id, region->start_uaddr,
+ region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+ gfn, page_offset, page_count);
+
+ return !ret;
+}
+
+/**
+ * mshv_region_interval_invalidate - Invalidate a range of memory region
+ * @mni: Pointer to the mmu_interval_notifier structure
+ * @range: Pointer to the mmu_notifier_range structure
+ * @cur_seq: Current sequence number for the interval notifier
+ *
+ * This function invalidates a memory region by remapping its pages with
+ * no access permissions. It locks the region's mutex to ensure thread safety
+ * and updates the sequence number for the interval notifier. If the range
+ * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking
+ * lock and returns false if unsuccessful.
+ *
+ * NOTE: Failure to invalidate a region is a serious error, as the pages will
+ * be considered freed while they are still mapped by the hypervisor.
+ * Any attempt to access such pages will likely crash the system.
+ *
+ * Return: true if the region was successfully invalidated, false otherwise.
+ */
+static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
+{
+ struct mshv_mem_region *region = container_of(mni,
+ struct mshv_mem_region,
+ mni);
+ u64 page_offset = 0, page_count = 0; /* may be logged in out_fail before being set */
+ unsigned long mstart, mend;
+ int ret = -EPERM;
+
+ if (mmu_notifier_range_blockable(range))
+ mutex_lock(&region->mutex);
+ else if (!mutex_trylock(&region->mutex))
+ goto out_fail;
+
+ mmu_interval_set_seq(mni, cur_seq);
+
+ mstart = max(range->start, region->start_uaddr);
+ mend = min(range->end, region->start_uaddr +
+ (region->nr_pages << HV_HYP_PAGE_SHIFT));
+
+ page_offset = HVPFN_DOWN(mstart - region->start_uaddr);
+ page_count = HVPFN_DOWN(mend - mstart);
+
+ ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS,
+ page_offset, page_count);
+ if (ret)
+ goto out_fail;
+
+ mshv_region_invalidate_pages(region, page_offset, page_count);
+
+ mutex_unlock(&region->mutex);
+
+ return true;
+
+out_fail:
+ WARN_ONCE(ret,
+ "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n",
+ region->start_uaddr,
+ region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+ range->start, range->end, range->event,
+ page_offset, page_offset + page_count - 1, (u64)range->mm, ret);
+ return false;
+}
+
+static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
+ .invalidate = mshv_region_interval_invalidate,
+};
+
+void mshv_region_movable_fini(struct mshv_mem_region *region)
+{
+ mmu_interval_notifier_remove(&region->mni);
+}
+
+bool mshv_region_movable_init(struct mshv_mem_region *region)
+{
+ int ret;
+
+ ret = mmu_interval_notifier_insert(&region->mni, current->mm,
+ region->start_uaddr,
+ region->nr_pages << HV_HYP_PAGE_SHIFT,
+ &mshv_region_mni_ops);
+ if (ret)
+ return false;
+
+ mutex_init(&region->mutex);
+
+ return true;
+}
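
Taken together, mshv_regions.c gives the root-partition code a small lifecycle:
create a region, attach it to a partition, populate it (pinned up front, or
faulted in on demand for movable regions), map it into the guest, and drop the
reference when done. The sketch below shows that flow for a pinned region; it
is illustrative only: example_add_pinned_region() is a made-up name, a
non-encrypted partition is assumed, and the real ioctl path in mshv_root_main.c
also takes pt_mem_regions_lock, handles MMIO and movable regions, and does more
error handling.

/* Illustrative lifecycle of a pinned region; not the real ioctl handler. */
static int example_add_pinned_region(struct mshv_partition *partition,
                                     u64 guest_pfn, u64 nr_pages,
                                     u64 uaddr, u32 flags)
{
        struct mshv_mem_region *region;
        int ret;

        region = mshv_region_create(guest_pfn, nr_pages, uaddr, flags);
        if (IS_ERR(region))
                return PTR_ERR(region);

        region->partition = partition;
        region->type = MSHV_REGION_TYPE_MEM_PINNED;

        /* Populate region->pages; batched pin_user_pages_fast() underneath. */
        ret = mshv_region_pin(region);
        if (ret) {
                /* Nothing is pinned or mapped yet, so a plain vfree() is enough. */
                vfree(region);
                return ret;
        }

        /* Map the backing pages into the guest physical address space. */
        ret = mshv_region_map(region);
        if (ret) {
                /* Drops the initial kref; the destroy path unmaps and unpins. */
                mshv_region_put(region);
                return ret;
        }

        hlist_add_head(&region->hnode, &partition->pt_mem_regions);
        return 0;
}

A movable region would instead call mshv_region_movable_init() after creation
and let mshv_region_handle_gfn_fault() populate and map pages on demand.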
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index e3931b0f1269..3c1d88b36741 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -15,6 +15,7 @@
#include <linux/hashtable.h>
#include <linux/dev_printk.h>
#include <linux/build_bug.h>
+#include <linux/mmu_notifier.h>
#include <uapi/linux/mshv.h>
/*
@@ -70,18 +71,23 @@ do { \
#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__)
#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)
+enum mshv_region_type {
+ MSHV_REGION_TYPE_MEM_PINNED,
+ MSHV_REGION_TYPE_MEM_MOVABLE,
+ MSHV_REGION_TYPE_MMIO
+};
+
struct mshv_mem_region {
struct hlist_node hnode;
+ struct kref refcount;
u64 nr_pages;
u64 start_gfn;
u64 start_uaddr;
u32 hv_map_flags;
- struct {
- u64 large_pages: 1; /* 2MiB */
- u64 range_pinned: 1;
- u64 reserved: 62;
- } flags;
struct mshv_partition *partition;
+ enum mshv_region_type type;
+ struct mmu_interval_notifier mni;
+ struct mutex mutex; /* protects region pages remapping */
struct page *pages[];
};
@@ -98,6 +104,8 @@ struct mshv_partition {
u64 pt_id;
refcount_t pt_ref_count;
struct mutex pt_mutex;
+
+ spinlock_t pt_mem_regions_lock;
struct hlist_head pt_mem_regions; // not ordered
u32 pt_vp_count;
@@ -169,7 +177,7 @@ struct mshv_girq_routing_table {
};
struct hv_synic_pages {
- struct hv_message_page *synic_message_page;
+ struct hv_message_page *hyp_synic_message_page;
struct hv_synic_event_flags_page *synic_event_flags_page;
struct hv_synic_event_ring_page *synic_event_ring_page;
};
@@ -178,6 +186,7 @@ struct mshv_root {
struct hv_synic_pages __percpu *synic_pages;
spinlock_t pt_ht_lock;
DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
+ struct hv_partition_property_vmm_capabilities vmm_caps;
};
/*
@@ -278,11 +287,12 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
/* Choose between pages and bytes */
struct hv_vp_state_data state_data, u64 page_count,
struct page **pages, u32 num_bytes, u8 *bytes);
-int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
- union hv_input_vtl input_vtl,
- struct page **state_page);
-int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
- union hv_input_vtl input_vtl);
+int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page);
+int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ struct page *state_page,
+ union hv_input_vtl input_vtl);
int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
u64 connection_partition_id, struct hv_port_info *port_info,
u8 port_vtl, u8 min_connection_vtl, int node);
@@ -295,17 +305,32 @@ int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
int hv_call_disconnect_port(u64 connection_partition_id,
union hv_connection_id connection_id);
int hv_call_notify_port_ring_empty(u32 sint_index);
-int hv_call_map_stat_page(enum hv_stats_object_type type,
- const union hv_stats_object_identity *identity,
- void **addr);
-int hv_call_unmap_stat_page(enum hv_stats_object_type type,
- const union hv_stats_object_identity *identity);
+int hv_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ void **addr);
+int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
+ const union hv_stats_object_identity *identity);
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
u64 page_struct_count, u32 host_access,
u32 flags, u8 acquire);
+int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg,
+ void *property_value, size_t property_value_sz);
extern struct mshv_root mshv_root;
extern enum hv_scheduler_type hv_scheduler_type;
extern u8 * __percpu *hv_synic_eventring_tail;
+struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
+ u64 uaddr, u32 flags);
+int mshv_region_share(struct mshv_mem_region *region);
+int mshv_region_unshare(struct mshv_mem_region *region);
+int mshv_region_map(struct mshv_mem_region *region);
+void mshv_region_invalidate(struct mshv_mem_region *region);
+int mshv_region_pin(struct mshv_mem_region *region);
+void mshv_region_put(struct mshv_mem_region *region);
+int mshv_region_get(struct mshv_mem_region *region);
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
+void mshv_region_movable_fini(struct mshv_mem_region *region);
+bool mshv_region_movable_init(struct mshv_mem_region *region);
+
#endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index c9c274f29c3c..598eaff4ff29 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -388,7 +388,13 @@ int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
memset(input, 0, sizeof(*input));
input->partition_id = partition_id;
input->vector = vector;
+ /*
+ * NOTE: dest_addr only needs to be provided when asserting an
+ * interrupt on x86.
+ */
+#if IS_ENABLED(CONFIG_X86)
input->dest_addr = dest_addr;
+#endif
input->control = control;
status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL);
local_irq_restore(flags);
@@ -526,9 +532,9 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
return ret;
}
-int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
- union hv_input_vtl input_vtl,
- struct page **state_page)
+static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page)
{
struct hv_input_map_vp_state_page *input;
struct hv_output_map_vp_state_page *output;
@@ -542,12 +548,20 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ memset(input, 0, sizeof(*input));
input->partition_id = partition_id;
input->vp_index = vp_index;
input->type = type;
input->input_vtl = input_vtl;
- status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output);
+ if (*state_page) {
+ input->flags.map_location_provided = 1;
+ input->requested_map_location =
+ page_to_pfn(*state_page);
+ }
+
+ status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input,
+ output);
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
if (hv_result_success(status))
@@ -565,8 +579,41 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
return ret;
}
-int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
- union hv_input_vtl input_vtl)
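+/*
+ * Overlay GPFNs are used when Linux runs as an L1VH partition and the VMM
+ * has advertised that it can provide the overlay page itself.
+ */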
+static bool mshv_use_overlay_gpfn(void)
+{
+ return hv_l1vh_partition() &&
+ mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn;
+}
+
+int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page)
+{
+ int ret = 0;
+ struct page *allocated_page = NULL;
+
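+ /*
+ * When overlay GPFNs are in use, allocate the backing page here and
+ * ask the hypervisor to map the VP state page at that location.
+ */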
+ if (mshv_use_overlay_gpfn()) {
+ allocated_page = alloc_page(GFP_KERNEL);
+ if (!allocated_page)
+ return -ENOMEM;
+ *state_page = allocated_page;
+ } else {
+ *state_page = NULL;
+ }
+
+ ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl,
+ state_page);
+
+ if (ret && allocated_page) {
+ __free_page(allocated_page);
+ *state_page = NULL;
+ }
+
+ return ret;
+}
+
+static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl)
{
unsigned long flags;
u64 status;
@@ -590,6 +637,48 @@ int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
return hv_result_to_errno(status);
}
+int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ struct page *state_page, union hv_input_vtl input_vtl)
+{
+ int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl);
+
+ if (mshv_use_overlay_gpfn() && state_page)
+ __free_page(state_page);
+
+ return ret;
+}
+
+int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code,
+ u64 arg, void *property_value,
+ size_t property_value_sz)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_get_partition_property_ex *input;
+ struct hv_output_get_partition_property_ex *output;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->property_code = property_code;
+ input->arg = arg;
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output);
+
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ hv_status_debug(status, "\n");
+ return hv_result_to_errno(status);
+ }
+ memcpy(property_value, &output->property_value, property_value_sz);
+
+ local_irq_restore(flags);
+
+ return 0;
+}
+
int
hv_call_clear_virtual_interrupt(u64 partition_id)
{
@@ -724,9 +813,51 @@ hv_call_notify_port_ring_empty(u32 sint_index)
return hv_result_to_errno(status);
}
-int hv_call_map_stat_page(enum hv_stats_object_type type,
- const union hv_stats_object_identity *identity,
- void **addr)
+static int hv_call_map_stats_page2(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ u64 map_location)
+{
+ unsigned long flags;
+ struct hv_input_map_stats_page2 *input;
+ u64 status;
+ int ret;
+
+ if (!map_location || !mshv_use_overlay_gpfn())
+ return -EINVAL;
+
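+ /*
+ * Retry the hypercall, depositing a page with the hypervisor whenever
+ * it reports HV_STATUS_INSUFFICIENT_MEMORY.
+ */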
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->type = type;
+ input->identity = *identity;
+ input->map_location = map_location;
+
+ status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL);
+
+ local_irq_restore(flags);
+
+ ret = hv_result_to_errno(status);
+
+ if (!ret)
+ break;
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ hv_status_debug(status, "\n");
+ break;
+ }
+
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ hv_current_partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+static int hv_call_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ void **addr)
{
unsigned long flags;
struct hv_input_map_stats_page *input;
@@ -765,8 +896,38 @@ int hv_call_map_stat_page(enum hv_stats_object_type type,
return ret;
}
-int hv_call_unmap_stat_page(enum hv_stats_object_type type,
- const union hv_stats_object_identity *identity)
+int hv_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ void **addr)
+{
+ int ret;
+ struct page *allocated_page = NULL;
+
+ if (!addr)
+ return -EINVAL;
+
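+ /*
+ * With overlay GPFNs the kernel supplies the backing page; otherwise
+ * the hypervisor maps its own page and returns the address.
+ */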
+ if (mshv_use_overlay_gpfn()) {
+ allocated_page = alloc_page(GFP_KERNEL);
+ if (!allocated_page)
+ return -ENOMEM;
+
+ ret = hv_call_map_stats_page2(type, identity,
+ page_to_pfn(allocated_page));
+ *addr = page_address(allocated_page);
+ } else {
+ ret = hv_call_map_stats_page(type, identity, addr);
+ }
+
+ if (ret && allocated_page) {
+ __free_page(allocated_page);
+ *addr = NULL;
+ }
+
+ return ret;
+}
+
+static int hv_call_unmap_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity)
{
unsigned long flags;
struct hv_input_unmap_stats_page *input;
@@ -785,6 +946,19 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type,
return hv_result_to_errno(status);
}
+int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
+ const union hv_stats_object_identity *identity)
+{
+ int ret;
+
+ ret = hv_call_unmap_stats_page(type, identity);
+
+ if (mshv_use_overlay_gpfn() && page_addr)
+ __free_page(virt_to_page(page_addr));
+
+ return ret;
+}
+
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
u64 page_struct_count, u32 host_access,
u32 flags, u8 acquire)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 1d8d8d00e4e0..1134a82c7881 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -42,7 +42,7 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
/* TODO move this to another file when debugfs code is added */
enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
- VpRootDispatchThreadBlocked = 201,
+ VpRootDispatchThreadBlocked = 202,
#elif defined(CONFIG_ARM64)
VpRootDispatchThreadBlocked = 94,
#endif
@@ -123,6 +123,7 @@ static struct miscdevice mshv_dev = {
*/
static u16 mshv_passthru_hvcalls[] = {
HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_GET_PARTITION_PROPERTY_EX,
HVCALL_SET_PARTITION_PROPERTY,
HVCALL_INSTALL_INTERCEPT,
HVCALL_GET_VP_REGISTERS,
@@ -137,6 +138,16 @@ static u16 mshv_passthru_hvcalls[] = {
HVCALL_GET_VP_CPUID_VALUES,
};
+/*
+ * Only allow hypercalls that are safe to be called by the VMM with the host
+ * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
+ * hypercall cannot be misused by the VMM before adding it to this list.
+ */
+static u16 mshv_self_passthru_hvcalls[] = {
+ HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_GET_PARTITION_PROPERTY_EX,
+};
+
static bool mshv_hvcall_is_async(u16 code)
{
switch (code) {
@@ -148,18 +159,38 @@ static bool mshv_hvcall_is_async(u16 code)
return false;
}
+static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
+{
+ int i;
+ int n = ARRAY_SIZE(mshv_passthru_hvcalls);
+ u16 *allowed_hvcalls = mshv_passthru_hvcalls;
+
+ if (pt_id == HV_PARTITION_ID_SELF) {
+ n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
+ allowed_hvcalls = mshv_self_passthru_hvcalls;
+ }
+
+ for (i = 0; i < n; ++i)
+ if (allowed_hvcalls[i] == code)
+ return true;
+
+ return false;
+}
+
static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
bool partition_locked,
void __user *user_args)
{
u64 status;
- int ret = 0, i;
+ int ret = 0;
bool is_async;
struct mshv_root_hvcall args;
struct page *page;
unsigned int pages_order;
void *input_pg = NULL;
void *output_pg = NULL;
+ u16 reps_completed;
+ u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;
if (copy_from_user(&args, user_args, sizeof(args)))
return -EFAULT;
@@ -171,17 +202,13 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
return -EINVAL;
- for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i)
- if (args.code == mshv_passthru_hvcalls[i])
- break;
-
- if (i >= ARRAY_SIZE(mshv_passthru_hvcalls))
+ if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
return -EINVAL;
is_async = mshv_hvcall_is_async(args.code);
if (is_async) {
/* async hypercalls can only be called from partition fd */
- if (!partition_locked)
+ if (!partition || !partition_locked)
return -EINVAL;
ret = mshv_init_async_handler(partition);
if (ret)
@@ -209,43 +236,44 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
* NOTE: This only works because all the allowed hypercalls' input
* structs begin with a u64 partition_id field.
*/
- *(u64 *)input_pg = partition->pt_id;
+ *(u64 *)input_pg = pt_id;
- if (args.reps)
- status = hv_do_rep_hypercall(args.code, args.reps, 0,
- input_pg, output_pg);
- else
- status = hv_do_hypercall(args.code, input_pg, output_pg);
-
- if (hv_result(status) == HV_STATUS_CALL_PENDING) {
- if (is_async) {
- mshv_async_hvcall_handler(partition, &status);
- } else { /* Paranoia check. This shouldn't happen! */
- ret = -EBADFD;
- goto free_pages_out;
+ reps_completed = 0;
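+ /*
+ * Retry on HV_STATUS_INSUFFICIENT_MEMORY after depositing a page with
+ * the hypervisor; rep hypercalls resume from the last completed rep.
+ */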
+ do {
+ if (args.reps) {
+ status = hv_do_rep_hypercall_ex(args.code, args.reps,
+ 0, reps_completed,
+ input_pg, output_pg);
+ reps_completed = hv_repcomp(status);
+ } else {
+ status = hv_do_hypercall(args.code, input_pg, output_pg);
}
- }
- if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
- ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
- if (!ret)
- ret = -EAGAIN;
- } else if (!hv_result_success(status)) {
- ret = hv_result_to_errno(status);
- }
+ if (hv_result(status) == HV_STATUS_CALL_PENDING) {
+ if (is_async) {
+ mshv_async_hvcall_handler(partition, &status);
+ } else { /* Paranoia check. This shouldn't happen! */
+ ret = -EBADFD;
+ goto free_pages_out;
+ }
+ }
+
+ if (hv_result_success(status))
+ break;
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+ ret = hv_result_to_errno(status);
+ else
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ pt_id, 1);
+ } while (!ret);
- /*
- * Always return the status and output data regardless of result.
- * The VMM may need it to determine how to proceed. E.g. the status may
- * contain the number of reps completed if a rep hypercall partially
- * succeeded.
- */
args.status = hv_result(status);
- args.reps = args.reps ? hv_repcomp(status) : 0;
+ args.reps = reps_completed;
if (copy_to_user(user_args, &args, sizeof(args)))
ret = -EFAULT;
- if (output_pg &&
+ if (!ret && output_pg &&
copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
ret = -EFAULT;
@@ -569,14 +597,98 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
"sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
+ if (gfn >= region->start_gfn &&
+ gfn < region->start_gfn + region->nr_pages)
+ return region;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_X86_64
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ spin_lock(&p->pt_mem_regions_lock);
+ region = mshv_partition_region_by_gfn(p, gfn);
+ if (!region || !mshv_region_get(region)) {
+ spin_unlock(&p->pt_mem_regions_lock);
+ return NULL;
+ }
+ spin_unlock(&p->pt_mem_regions_lock);
+
+ return region;
+}
+
+/**
+ * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
+ * @vp: Pointer to the virtual processor structure.
+ *
+ * This function processes GPA intercepts by locating the memory region
+ * containing the intercepted GFN and delegating the fault to the region's
+ * GFN fault handler, which may map multiple pages at once. Only movable
+ * memory regions are handled this way.
+ *
+ * Return: true if the intercept was handled successfully, false otherwise.
+ */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
+{
+ struct mshv_partition *p = vp->vp_partition;
+ struct mshv_mem_region *region;
+ struct hv_x64_memory_intercept_message *msg;
+ bool ret;
+ u64 gfn;
+
+ msg = (struct hv_x64_memory_intercept_message *)
+ vp->vp_intercept_msg_page->u.payload;
+
+ gfn = HVPFN_DOWN(msg->guest_physical_address);
+
+ region = mshv_partition_region_by_gfn_get(p, gfn);
+ if (!region)
+ return false;
+
+ /* Only movable memory ranges are supported for GPA intercepts */
+ if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ ret = mshv_region_handle_gfn_fault(region, gfn);
+ else
+ ret = false;
+
+ mshv_region_put(region);
+
+ return ret;
+}
+#else /* CONFIG_X86_64 */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
+#endif /* CONFIG_X86_64 */
+
+static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
+{
+ switch (vp->vp_intercept_msg_page->header.message_type) {
+ case HVMSG_GPA_INTERCEPT:
+ return mshv_handle_gpa_intercept(vp);
+ }
+ return false;
+}
+
static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
long rc;
- if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
- rc = mshv_run_vp_with_root_scheduler(vp);
- else
- rc = mshv_run_vp_with_hyp_scheduler(vp);
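+ /*
+ * Re-enter the VP as long as the intercept can be handled in the
+ * kernel (e.g. a GPA fault on a movable region); anything else is
+ * returned to userspace.
+ */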
+ do {
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ rc = mshv_run_vp_with_root_scheduler(vp);
+ else
+ rc = mshv_run_vp_with_hyp_scheduler(vp);
+ } while (rc == 0 && mshv_vp_handle_intercept(vp));
if (rc)
return rc;
@@ -844,7 +956,8 @@ mshv_vp_release(struct inode *inode, struct file *filp)
return 0;
}
-static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
+static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
+ void *stats_pages[])
{
union hv_stats_object_identity identity = {
.vp.partition_id = partition_id,
@@ -852,10 +965,10 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
};
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
}
static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
@@ -868,14 +981,14 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
int err;
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
- &stats_pages[HV_STATS_AREA_SELF]);
+ err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_SELF]);
if (err)
return err;
identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
- err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
- &stats_pages[HV_STATS_AREA_PARENT]);
+ err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_PARENT]);
if (err)
goto unmap_self;
@@ -883,7 +996,7 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
unmap_self:
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
return err;
}
@@ -893,7 +1006,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
{
struct mshv_create_vp args;
struct mshv_vp *vp;
- struct page *intercept_message_page, *register_page, *ghcb_page;
+ struct page *intercept_msg_page, *register_page, *ghcb_page;
void *stats_pages[2];
long ret;
@@ -911,33 +1024,34 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
if (ret)
return ret;
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero,
- &intercept_message_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ input_vtl_zero, &intercept_msg_page);
if (ret)
goto destroy_vp;
if (!mshv_partition_encrypted(partition)) {
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero,
- &register_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ input_vtl_zero, &register_page);
if (ret)
goto unmap_intercept_message_page;
}
if (mshv_partition_encrypted(partition) &&
is_ghcb_mapping_available()) {
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal,
- &ghcb_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ input_vtl_normal, &ghcb_page);
if (ret)
goto unmap_register_page;
}
- if (hv_parent_partition()) {
+ /*
+ * This mapping of the stats page is used to detect whether the
+ * dispatch thread is blocked - only relevant for the root scheduler.
+ */
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) {
ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
stats_pages);
if (ret)
@@ -959,14 +1073,14 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
atomic64_set(&vp->run.vp_signaled_count, 0);
vp->vp_index = args.vp_index;
- vp->vp_intercept_msg_page = page_to_virt(intercept_message_page);
+ vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
if (!mshv_partition_encrypted(partition))
vp->vp_register_page = page_to_virt(register_page);
if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
vp->vp_ghcb_page = page_to_virt(ghcb_page);
- if (hv_parent_partition())
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
/*
@@ -989,24 +1103,22 @@ put_partition:
free_vp:
kfree(vp);
unmap_stats_pages:
- if (hv_parent_partition())
- mshv_vp_stats_unmap(partition->pt_id, args.vp_index);
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
unmap_ghcb_page:
- if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) {
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal);
- }
+ if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB, ghcb_page,
+ input_vtl_normal);
unmap_register_page:
- if (!mshv_partition_encrypted(partition)) {
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero);
- }
+ if (!mshv_partition_encrypted(partition))
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ register_page, input_vtl_zero);
unmap_intercept_message_page:
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero);
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ intercept_msg_page, input_vtl_zero);
destroy_vp:
hv_call_delete_vp(partition->pt_id, args.vp_index);
return ret;
@@ -1034,162 +1146,6 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
*status = partition->async_hypercall_status;
}
-static int
-mshv_partition_region_share(struct mshv_mem_region *region)
-{
- u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
-
- if (region->flags.large_pages)
- flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
-
- return hv_call_modify_spa_host_access(region->partition->pt_id,
- region->pages, region->nr_pages,
- HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
- flags, true);
-}
-
-static int
-mshv_partition_region_unshare(struct mshv_mem_region *region)
-{
- u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
-
- if (region->flags.large_pages)
- flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
-
- return hv_call_modify_spa_host_access(region->partition->pt_id,
- region->pages, region->nr_pages,
- 0,
- flags, false);
-}
-
-static int
-mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags,
- u64 page_offset, u64 page_count)
-{
- if (page_offset + page_count > region->nr_pages)
- return -EINVAL;
-
- if (region->flags.large_pages)
- map_flags |= HV_MAP_GPA_LARGE_PAGE;
-
- /* ask the hypervisor to map guest ram */
- return hv_call_map_gpa_pages(region->partition->pt_id,
- region->start_gfn + page_offset,
- page_count, map_flags,
- region->pages + page_offset);
-}
-
-static int
-mshv_region_map(struct mshv_mem_region *region)
-{
- u32 map_flags = region->hv_map_flags;
-
- return mshv_region_remap_pages(region, map_flags,
- 0, region->nr_pages);
-}
-
-static void
-mshv_region_evict_pages(struct mshv_mem_region *region,
- u64 page_offset, u64 page_count)
-{
- if (region->flags.range_pinned)
- unpin_user_pages(region->pages + page_offset, page_count);
-
- memset(region->pages + page_offset, 0,
- page_count * sizeof(struct page *));
-}
-
-static void
-mshv_region_evict(struct mshv_mem_region *region)
-{
- mshv_region_evict_pages(region, 0, region->nr_pages);
-}
-
-static int
-mshv_region_populate_pages(struct mshv_mem_region *region,
- u64 page_offset, u64 page_count)
-{
- u64 done_count, nr_pages;
- struct page **pages;
- __u64 userspace_addr;
- int ret;
-
- if (page_offset + page_count > region->nr_pages)
- return -EINVAL;
-
- for (done_count = 0; done_count < page_count; done_count += ret) {
- pages = region->pages + page_offset + done_count;
- userspace_addr = region->start_uaddr +
- (page_offset + done_count) *
- HV_HYP_PAGE_SIZE;
- nr_pages = min(page_count - done_count,
- MSHV_PIN_PAGES_BATCH_SIZE);
-
- /*
- * Pinning assuming 4k pages works for large pages too.
- * All page structs within the large page are returned.
- *
- * Pin requests are batched because pin_user_pages_fast
- * with the FOLL_LONGTERM flag does a large temporary
- * allocation of contiguous memory.
- */
- if (region->flags.range_pinned)
- ret = pin_user_pages_fast(userspace_addr,
- nr_pages,
- FOLL_WRITE | FOLL_LONGTERM,
- pages);
- else
- ret = -EOPNOTSUPP;
-
- if (ret < 0)
- goto release_pages;
- }
-
- if (PageHuge(region->pages[page_offset]))
- region->flags.large_pages = true;
-
- return 0;
-
-release_pages:
- mshv_region_evict_pages(region, page_offset, done_count);
- return ret;
-}
-
-static int
-mshv_region_populate(struct mshv_mem_region *region)
-{
- return mshv_region_populate_pages(region, 0, region->nr_pages);
-}
-
-static struct mshv_mem_region *
-mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
-{
- struct mshv_mem_region *region;
-
- hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
- if (gfn >= region->start_gfn &&
- gfn < region->start_gfn + region->nr_pages)
- return region;
- }
-
- return NULL;
-}
-
-static struct mshv_mem_region *
-mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr)
-{
- struct mshv_mem_region *region;
-
- hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
- if (uaddr >= region->start_uaddr &&
- uaddr < region->start_uaddr +
- (region->nr_pages << HV_HYP_PAGE_SHIFT))
- return region;
- }
-
- return NULL;
-}
-
/*
* NB: caller checks and makes sure mem->size is page aligned
* Returns: 0 with regionpp updated on success, or -errno
@@ -1199,53 +1155,61 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
struct mshv_mem_region **regionpp,
bool is_mmio)
{
- struct mshv_mem_region *region;
+ struct mshv_mem_region *rg;
u64 nr_pages = HVPFN_DOWN(mem->size);
/* Reject overlapping regions */
- if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) ||
- mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) ||
- mshv_partition_region_by_uaddr(partition, mem->userspace_addr) ||
- mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1))
+ spin_lock(&partition->pt_mem_regions_lock);
+ hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
+ if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
+ rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
+ continue;
+ spin_unlock(&partition->pt_mem_regions_lock);
return -EEXIST;
+ }
+ spin_unlock(&partition->pt_mem_regions_lock);
- region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
- if (!region)
- return -ENOMEM;
-
- region->nr_pages = nr_pages;
- region->start_gfn = mem->guest_pfn;
- region->start_uaddr = mem->userspace_addr;
- region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
- if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
- region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
- if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
- region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
+ rg = mshv_region_create(mem->guest_pfn, nr_pages,
+ mem->userspace_addr, mem->flags);
+ if (IS_ERR(rg))
+ return PTR_ERR(rg);
- /* Note: large_pages flag populated when we pin the pages */
- if (!is_mmio)
- region->flags.range_pinned = true;
+ if (is_mmio)
+ rg->type = MSHV_REGION_TYPE_MMIO;
+ else if (mshv_partition_encrypted(partition) ||
+ !mshv_region_movable_init(rg))
+ rg->type = MSHV_REGION_TYPE_MEM_PINNED;
+ else
+ rg->type = MSHV_REGION_TYPE_MEM_MOVABLE;
- region->partition = partition;
+ rg->partition = partition;
- *regionpp = region;
+ *regionpp = rg;
return 0;
}
-/*
- * Map guest ram. if snp, make sure to release that from the host first
- * Side Effects: In case of failure, pages are unpinned when feasible.
+/**
+ * mshv_prepare_pinned_region - Pin and map a memory region
+ * @region: Pointer to the memory region structure
+ *
+ * This function processes memory regions that are explicitly marked as pinned.
+ * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
+ * population. The function ensures the region is properly populated, handles
+ * encryption requirements for SNP partitions if applicable, maps the region,
+ * and performs necessary sharing or eviction operations based on the mapping
+ * result.
+ *
+ * Return: 0 on success, negative error code on failure.
*/
-static int
-mshv_partition_mem_region_map(struct mshv_mem_region *region)
+static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
struct mshv_partition *partition = region->partition;
int ret;
- ret = mshv_region_populate(region);
+ ret = mshv_region_pin(region);
if (ret) {
- pt_err(partition, "Failed to populate memory region: %d\n",
+ pt_err(partition, "Failed to pin memory region: %d\n",
ret);
goto err_out;
}
@@ -1258,12 +1222,12 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
* access to guest memory regions.
*/
if (mshv_partition_encrypted(partition)) {
- ret = mshv_partition_region_unshare(region);
+ ret = mshv_region_unshare(region);
if (ret) {
pt_err(partition,
"Failed to unshare memory region (guest_pfn: %llu): %d\n",
region->start_gfn, ret);
- goto evict_region;
+ goto invalidate_region;
}
}
@@ -1271,9 +1235,9 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
if (ret && mshv_partition_encrypted(partition)) {
int shrc;
- shrc = mshv_partition_region_share(region);
+ shrc = mshv_region_share(region);
if (!shrc)
- goto evict_region;
+ goto invalidate_region;
pt_err(partition,
"Failed to share memory region (guest_pfn: %llu): %d\n",
@@ -1287,8 +1251,8 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
return 0;
-evict_region:
- mshv_region_evict(region);
+invalidate_region:
+ mshv_region_invalidate(region);
err_out:
return ret;
}
@@ -1333,17 +1297,35 @@ mshv_map_user_memory(struct mshv_partition *partition,
if (ret)
return ret;
- if (is_mmio)
- ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
- mmio_pfn, HVPFN_DOWN(mem.size));
- else
- ret = mshv_partition_mem_region_map(region);
+ switch (region->type) {
+ case MSHV_REGION_TYPE_MEM_PINNED:
+ ret = mshv_prepare_pinned_region(region);
+ break;
+ case MSHV_REGION_TYPE_MEM_MOVABLE:
+ /*
+ * For movable memory regions, remap with no access to let
+ * the hypervisor track dirty pages, enabling pre-copy live
+ * migration.
+ */
+ ret = hv_call_map_gpa_pages(partition->pt_id,
+ region->start_gfn,
+ region->nr_pages,
+ HV_MAP_GPA_NO_ACCESS, NULL);
+ break;
+ case MSHV_REGION_TYPE_MMIO:
+ ret = hv_call_map_mmio_pages(partition->pt_id,
+ region->start_gfn,
+ mmio_pfn,
+ region->nr_pages);
+ break;
+ }
if (ret)
goto errout;
- /* Install the new region */
+ spin_lock(&partition->pt_mem_regions_lock);
hlist_add_head(&region->hnode, &partition->pt_mem_regions);
+ spin_unlock(&partition->pt_mem_regions_lock);
return 0;
@@ -1358,33 +1340,32 @@ mshv_unmap_user_memory(struct mshv_partition *partition,
struct mshv_user_mem_region mem)
{
struct mshv_mem_region *region;
- u32 unmap_flags = 0;
if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
return -EINVAL;
+ spin_lock(&partition->pt_mem_regions_lock);
+
region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
- if (!region)
- return -EINVAL;
+ if (!region) {
+ spin_unlock(&partition->pt_mem_regions_lock);
+ return -ENOENT;
+ }
/* Paranoia check */
if (region->start_uaddr != mem.userspace_addr ||
region->start_gfn != mem.guest_pfn ||
- region->nr_pages != HVPFN_DOWN(mem.size))
+ region->nr_pages != HVPFN_DOWN(mem.size)) {
+ spin_unlock(&partition->pt_mem_regions_lock);
return -EINVAL;
+ }
hlist_del(&region->hnode);
- if (region->flags.large_pages)
- unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
-
- /* ignore unmap failures and continue as process may be exiting */
- hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
- region->nr_pages, unmap_flags);
+ spin_unlock(&partition->pt_mem_regions_lock);
- mshv_region_evict(region);
+ mshv_region_put(region);
- vfree(region);
return 0;
}
@@ -1720,8 +1701,8 @@ static void destroy_partition(struct mshv_partition *partition)
{
struct mshv_vp *vp;
struct mshv_mem_region *region;
- int i, ret;
struct hlist_node *n;
+ int i;
if (refcount_read(&partition->pt_ref_count)) {
pt_err(partition,
@@ -1743,28 +1724,32 @@ static void destroy_partition(struct mshv_partition *partition)
if (!vp)
continue;
- if (hv_parent_partition())
- mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
+ (void **)vp->vp_stats_pages);
if (vp->vp_register_page) {
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ virt_to_page(vp->vp_register_page),
+ input_vtl_zero);
vp->vp_register_page = NULL;
}
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ virt_to_page(vp->vp_intercept_msg_page),
+ input_vtl_zero);
vp->vp_intercept_msg_page = NULL;
if (vp->vp_ghcb_page) {
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ virt_to_page(vp->vp_ghcb_page),
+ input_vtl_normal);
vp->vp_ghcb_page = NULL;
}
@@ -1781,24 +1766,10 @@ static void destroy_partition(struct mshv_partition *partition)
remove_partition(partition);
- /* Remove regions, regain access to the memory and unpin the pages */
hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
hnode) {
hlist_del(&region->hnode);
-
- if (mshv_partition_encrypted(partition)) {
- ret = mshv_partition_region_share(region);
- if (ret) {
- pt_err(partition,
- "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
- ret);
- return;
- }
- }
-
- mshv_region_evict(region);
-
- vfree(region);
+ mshv_region_put(region);
}
/* Withdraw and free all pages we deposited */
@@ -1865,41 +1836,117 @@ add_partition(struct mshv_partition *partition)
return 0;
}
-static long
-mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
+ HV_PARTITION_PROCESSOR_FEATURES_BANKS);
+
+static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
+ struct hv_partition_creation_properties *cr_props,
+ union hv_partition_isolation_properties *isol_props)
{
- struct mshv_create_partition args;
- u64 creation_flags;
- struct hv_partition_creation_properties creation_properties = {};
- union hv_partition_isolation_properties isolation_properties = {};
- struct mshv_partition *partition;
- long ret;
+ int i;
+ struct mshv_create_partition_v2 args;
+ union hv_partition_processor_features *disabled_procs;
+ union hv_partition_processor_xsave_features *disabled_xsave;
- if (copy_from_user(&args, user_arg, sizeof(args)))
+ /* First, copy v1 struct in case user is on previous versions */
+ if (copy_from_user(&args, user_arg,
+ sizeof(struct mshv_create_partition)))
return -EFAULT;
if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
return -EINVAL;
+ disabled_procs = &cr_props->disabled_processor_features;
+ disabled_xsave = &cr_props->disabled_processor_xsave_features;
+
+ /* Check if user provided newer struct with feature fields */
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
+ if (copy_from_user(&args, user_arg, sizeof(args)))
+ return -EFAULT;
+
+ /* Re-validate v1 fields after second copy_from_user() */
+ if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
+ args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
+ return -EINVAL;
+
+ if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
+ mshv_field_nonzero(args, pt_rsvd) ||
+ mshv_field_nonzero(args, pt_rsvd1))
+ return -EINVAL;
+
+ /*
+ * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
+ * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
+ * (i.e. 2).
+ *
+ * Further banks (index >= 2) will be modifiable as 'early'
+ * properties via the set partition property hypercall.
+ */
+ for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+ disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
+
+#if IS_ENABLED(CONFIG_X86_64)
+ disabled_xsave->as_uint64 = args.pt_disabled_xsave;
+#else
+ /*
+ * In practice this field is ignored on arm64, but safer to
+ * zero it in case it is ever used.
+ */
+ disabled_xsave->as_uint64 = 0;
+
+ if (mshv_field_nonzero(args, pt_rsvd2))
+ return -EINVAL;
+#endif
+ } else {
+ /*
+ * v1 behavior: try to enable everything. The hypervisor will
+ * disable features that are not supported. The banks can be
+ * queried via the get partition property hypercall.
+ */
+ for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+ disabled_procs->as_uint64[i] = 0;
+
+ disabled_xsave->as_uint64 = 0;
+ }
+
/* Only support EXO partitions */
- creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
- HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+ *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
+ HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
- if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC))
- creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
- if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC))
- creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
- if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES))
- creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
+ isol_props->as_uint64 = 0;
switch (args.pt_isolation) {
case MSHV_PT_ISOLATION_NONE:
- isolation_properties.isolation_type =
- HV_PARTITION_ISOLATION_TYPE_NONE;
+ isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
break;
}
+ return 0;
+}
+
+static long
+mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+{
+ u64 creation_flags;
+ struct hv_partition_creation_properties creation_properties;
+ union hv_partition_isolation_properties isolation_properties;
+ struct mshv_partition *partition;
+ long ret;
+
+ ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
+ &creation_properties,
+ &isolation_properties);
+ if (ret)
+ return ret;
+
partition = kzalloc(sizeof(*partition), GFP_KERNEL);
if (!partition)
return -ENOMEM;
@@ -1919,6 +1966,7 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
INIT_HLIST_HEAD(&partition->pt_devices);
+ spin_lock_init(&partition->pt_mem_regions_lock);
INIT_HLIST_HEAD(&partition->pt_mem_regions);
mshv_eventfd_init(partition);
@@ -1966,6 +2014,9 @@ static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
case MSHV_CREATE_PARTITION:
return mshv_ioctl_create_partition((void __user *)arg,
misc->this_device);
+ case MSHV_ROOT_HVCALL:
+ return mshv_ioctl_passthru_hvcall(NULL, false,
+ (void __user *)arg);
}
return -ENOTTY;
@@ -2182,6 +2233,22 @@ root_sched_deinit:
return err;
}
+static void mshv_init_vmm_caps(struct device *dev)
+{
+ /*
+ * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or
+ * HV_PARTITION_PROPERTY_VMM_CAPABILITIES is not supported. In that
+ * case it is valid to proceed as if all vmm_caps are disabled (zero).
+ */
+ if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
+ HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
+ 0, &mshv_root.vmm_caps,
+ sizeof(mshv_root.vmm_caps)))
+ dev_warn(dev, "Unable to get VMM capabilities\n");
+
+ dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
+}
+
static int __init mshv_parent_partition_init(void)
{
int ret;
@@ -2234,6 +2301,8 @@ static int __init mshv_parent_partition_init(void)
if (ret)
goto remove_cpu_state;
+ mshv_init_vmm_caps(dev);
+
ret = mshv_irqfd_wq_init();
if (ret)
goto exit_partition;
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index e6b6381b7c36..f8b0337cdc82 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -394,7 +394,7 @@ unlock_out:
void mshv_isr(void)
{
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
- struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_message *msg;
bool handled;
@@ -456,7 +456,7 @@ int mshv_synic_init(unsigned int cpu)
#endif
union hv_synic_scontrol sctrl;
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
- struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
struct hv_synic_event_ring_page **event_ring_page =
@@ -550,7 +550,7 @@ int mshv_synic_cleanup(unsigned int cpu)
union hv_synic_sirbp sirbp;
union hv_synic_scontrol sctrl;
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
- struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
struct hv_synic_event_ring_page **event_ring_page =
diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h
new file mode 100644
index 000000000000..a6eea52f7aa2
--- /dev/null
+++ b/drivers/hv/mshv_vtl.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _MSHV_VTL_H
+#define _MSHV_VTL_H
+
+#include <linux/mshv.h>
+#include <linux/types.h>
+
+struct mshv_vtl_run {
+ u32 cancel;
+ u32 vtl_ret_action_size;
+ u32 pad[2];
+ char exit_message[MSHV_MAX_RUN_MSG_SIZE];
+ union {
+ struct mshv_vtl_cpu_context cpu_context;
+
+ /*
+ * Reserving room for the cpu context to grow and to maintain compatibility
+ * with user mode.
+ */
+ char reserved[1024];
+ };
+ char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE];
+};
+
+#endif /* _MSHV_VTL_H */
diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c
new file mode 100644
index 000000000000..2cebe9de5a5a
--- /dev/null
+++ b/drivers/hv/mshv_vtl_main.c
@@ -0,0 +1,1392 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Author:
+ * Roman Kisel <romank@linux.microsoft.com>
+ * Saurabh Sengar <ssengar@linux.microsoft.com>
+ * Naman Jain <namjain@linux.microsoft.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpuhotplug.h>
+#include <linux/count_zeros.h>
+#include <linux/entry-virt.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+#include <asm/debugreg.h>
+#include <asm/mshyperv.h>
+#include <trace/events/ipi.h>
+#include <uapi/asm/mtrr.h>
+#include <uapi/linux/mshv.h>
+#include <hyperv/hvhdk.h>
+
+#include "../../kernel/fpu/legacy.h"
+#include "mshv.h"
+#include "mshv_vtl.h"
+#include "hyperv_vmbus.h"
+
+MODULE_AUTHOR("Microsoft");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver");
+
+#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1
+#define MSHV_ENTRY_REASON_INTERRUPT 0x2
+#define MSHV_ENTRY_REASON_INTERCEPT 0x3
+
+#define MSHV_REAL_OFF_SHIFT 16
+#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1)
+#define MSHV_RUN_PAGE_OFFSET 0
+#define MSHV_REG_PAGE_OFFSET 1
+#define VTL2_VMBUS_SINT_INDEX 7
+
+static struct device *mem_dev;
+
+static struct tasklet_struct msg_dpc;
+static wait_queue_head_t fd_wait_queue;
+static bool has_message;
+static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT];
+static DEFINE_MUTEX(flag_lock);
+static bool __read_mostly mshv_has_reg_page;
+
+/* The hvcall code is a u16, so allocate a bitmap of (1 << 16) bits to cover every possible code */
+#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8)
+
+struct mshv_vtl_hvcall_fd {
+ u8 allow_bitmap[MAX_BITMAP_SIZE];
+ bool allow_map_initialized;
+ /*
+ * Used to protect hvcall setup in IOCTLs
+ */
+ struct mutex init_mutex;
+ struct miscdevice *dev;
+};
+
+struct mshv_vtl_poll_file {
+ struct file *file;
+ wait_queue_entry_t wait;
+ wait_queue_head_t *wqh;
+ poll_table pt;
+ int cpu;
+};
+
+struct mshv_vtl {
+ struct device *module_dev;
+ u64 id;
+};
+
+struct mshv_vtl_per_cpu {
+ struct mshv_vtl_run *run;
+ struct page *reg_page;
+};
+
+/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */
+union hv_synic_overlay_page_msr {
+ u64 as_uint64;
+ struct {
+ u64 enabled: 1;
+ u64 reserved: 11;
+ u64 pfn: 52;
+ } __packed;
+};
+
+static struct mutex mshv_vtl_poll_file_lock;
+static union hv_register_vsm_page_offsets mshv_vsm_page_offsets;
+static union hv_register_vsm_capabilities mshv_vsm_capabilities;
+
+static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file);
+static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions);
+static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu);
+
+static const union hv_input_vtl input_vtl_zero;
+static const union hv_input_vtl input_vtl_normal = {
+ .use_target_vtl = 1,
+};
+
+static const struct file_operations mshv_vtl_fops;
+
+static long
+mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev)
+{
+ struct mshv_vtl *vtl;
+ struct file *file;
+ int fd;
+
+ vtl = kzalloc(sizeof(*vtl), GFP_KERNEL);
+ if (!vtl)
+ return -ENOMEM;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ kfree(vtl);
+ return fd;
+ }
+ file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops,
+ vtl, O_RDWR);
+ if (IS_ERR(file)) {
+ kfree(vtl);
+ return PTR_ERR(file);
+ }
+ vtl->module_dev = module_dev;
+ fd_install(fd, file);
+
+ return fd;
+}
+
+static long
+mshv_ioctl_check_extension(void __user *user_arg)
+{
+ u32 arg;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ switch (arg) {
+ case MSHV_CAP_CORE_API_STABLE:
+ return 0;
+ case MSHV_CAP_REGISTER_PAGE:
+ return mshv_has_reg_page;
+ case MSHV_CAP_VTL_RETURN_ACTION:
+ return mshv_vsm_capabilities.return_action_available;
+ case MSHV_CAP_DR6_SHARED:
+ return mshv_vsm_capabilities.dr6_shared;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static long
+mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct miscdevice *misc = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_CHECK_EXTENSION:
+ return mshv_ioctl_check_extension((void __user *)arg);
+ case MSHV_CREATE_VTL:
+ return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device);
+ }
+
+ return -ENOTTY;
+}
+
+static const struct file_operations mshv_dev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = mshv_dev_ioctl,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice mshv_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mshv",
+ .fops = &mshv_dev_fops,
+ .mode = 0600,
+};
+
+static struct mshv_vtl_run *mshv_vtl_this_run(void)
+{
+ return *this_cpu_ptr(&mshv_vtl_per_cpu.run);
+}
+
+static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu)
+{
+ return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu);
+}
+
+static struct page *mshv_vtl_cpu_reg_page(int cpu)
+{
+ return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu);
+}
+
+static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu)
+{
+ struct hv_register_assoc reg_assoc = {};
+ union hv_synic_overlay_page_msr overlay = {};
+ struct page *reg_page;
+
+ reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL);
+ if (!reg_page) {
+ WARN(1, "failed to allocate register page\n");
+ return;
+ }
+
+ overlay.enabled = 1;
+ overlay.pfn = page_to_hvpfn(reg_page);
+ reg_assoc.name = HV_X64_REGISTER_REG_PAGE;
+ reg_assoc.value.reg64 = overlay.as_uint64;
+
+ if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_zero, &reg_assoc)) {
+ WARN(1, "failed to setup register page\n");
+ __free_page(reg_page);
+ return;
+ }
+
+ per_cpu->reg_page = reg_page;
+ mshv_has_reg_page = true;
+}
+
+static void mshv_vtl_synic_enable_regs(unsigned int cpu)
+{
+ union hv_synic_sint sint;
+
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = false;
+ sint.auto_eoi = hv_recommend_using_aeoi();
+
+ /* Enable intercepts */
+ if (!mshv_vsm_capabilities.intercept_page_available)
+ hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
+ sint.as_uint64);
+
+ /* VTL2 Host VSP SINT is (un)masked when the user mode requests that */
+}
+
+static int mshv_vtl_get_vsm_regs(void)
+{
+ struct hv_register_assoc registers[2];
+ int ret, count = 2;
+
+ registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS;
+ registers[1].name = HV_REGISTER_VSM_CAPABILITIES;
+
+ ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ count, input_vtl_zero, registers);
+ if (ret)
+ return ret;
+
+ mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64;
+ mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64;
+
+ return ret;
+}
+
+static int mshv_vtl_configure_vsm_partition(struct device *dev)
+{
+ union hv_register_vsm_partition_config config;
+ struct hv_register_assoc reg_assoc;
+
+ config.as_uint64 = 0;
+ config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK;
+ config.enable_vtl_protection = 1;
+ config.zero_memory_on_reset = 1;
+ config.intercept_vp_startup = 1;
+ config.intercept_cpuid_unimplemented = 1;
+
+ if (mshv_vsm_capabilities.intercept_page_available) {
+ dev_dbg(dev, "using intercept page\n");
+ config.intercept_page = 1;
+ }
+
+ reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG;
+ reg_assoc.value.reg64 = config.as_uint64;
+
+ return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_zero, &reg_assoc);
+}
+
+static void mshv_vtl_vmbus_isr(void)
+{
+ struct hv_per_cpu_context *per_cpu;
+ struct hv_message *msg;
+ u32 message_type;
+ union hv_synic_event_flags *event_flags;
+ struct eventfd_ctx *eventfd;
+ u16 i;
+
+ per_cpu = this_cpu_ptr(hv_context.cpu_context);
+ if (smp_processor_id() == 0) {
+ msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX;
+ message_type = READ_ONCE(msg->header.message_type);
+ if (message_type != HVMSG_NONE)
+ tasklet_schedule(&msg_dpc);
+ }
+
+ event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page +
+ VTL2_VMBUS_SINT_INDEX;
+ for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) {
+ if (!sync_test_and_clear_bit(i, event_flags->flags))
+ continue;
+ rcu_read_lock();
+ eventfd = READ_ONCE(flag_eventfds[i]);
+ if (eventfd)
+ eventfd_signal(eventfd);
+ rcu_read_unlock();
+ }
+
+ vmbus_isr();
+}
+
+static int mshv_vtl_alloc_context(unsigned int cpu)
+{
+ struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu);
+
+ per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!per_cpu->run)
+ return -ENOMEM;
+
+ if (mshv_vsm_capabilities.intercept_page_available)
+ mshv_vtl_configure_reg_page(per_cpu);
+
+ mshv_vtl_synic_enable_regs(cpu);
+
+ return 0;
+}
+
+static int mshv_vtl_cpuhp_online;
+
+static int hv_vtl_setup_synic(void)
+{
+ int ret;
+
+ /* Use our isr to first filter out packets destined for userspace */
+ hv_setup_vmbus_handler(mshv_vtl_vmbus_isr);
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online",
+ mshv_vtl_alloc_context, NULL);
+ if (ret < 0) {
+ hv_setup_vmbus_handler(vmbus_isr);
+ return ret;
+ }
+
+ mshv_vtl_cpuhp_online = ret;
+
+ return 0;
+}
+
+static void hv_vtl_remove_synic(void)
+{
+ cpuhp_remove_state(mshv_vtl_cpuhp_online);
+ hv_setup_vmbus_handler(vmbus_isr);
+}
+
+static int vtl_get_vp_register(struct hv_register_assoc *reg)
+{
+ return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_normal, reg);
+}
+
+static int vtl_set_vp_register(struct hv_register_assoc *reg)
+{
+ return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_normal, reg);
+}
+
+static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
+{
+ struct mshv_vtl_ram_disposition vtl0_mem;
+ struct dev_pagemap *pgmap;
+ void *addr;
+
+ if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem)))
+ return -EFAULT;
+ /* vtl0_mem.last_pfn is excluded from the pagemap range for VTL0 by design */
+ if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) {
+ dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n",
+ vtl0_mem.start_pfn, vtl0_mem.last_pfn);
+ return -EFAULT;
+ }
+
+ pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+
+ pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn);
+ pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1;
+ pgmap->nr_range = 1;
+ pgmap->type = MEMORY_DEVICE_GENERIC;
+
+ /*
+ * Determine the highest page order that can be used for the given memory range.
+ * This works best when both the start and the length of the range are aligned.
+ */
+ pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn);
+ dev_dbg(vtl->module_dev,
+ "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n",
+ vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift);
+
+ addr = devm_memremap_pages(mem_dev, pgmap);
+ if (IS_ERR(addr)) {
+ dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr));
+ kfree(pgmap);
+ return -EFAULT;
+ }
+
+ /* Don't free pgmap, since it has to stick around until the memory
+ * is unmapped, which will never happen: there is no scenario in
+ * which VTL0 can be released or shut down without bringing down VTL2.
+ */
+ return 0;
+}
+
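+/*
+ * Request that the given CPU abandon its pending return to the lower VTL:
+ * set the cancel flag and, for a remote CPU, send a reschedule IPI.
+ */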
+static void mshv_vtl_cancel(int cpu)
+{
+ int here = get_cpu();
+
+ if (here != cpu) {
+ if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1))
+ smp_send_reschedule(cpu);
+ } else {
+ WRITE_ONCE(mshv_vtl_this_run()->cancel, 1);
+ }
+ put_cpu();
+}
+
+static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+ struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait);
+
+ mshv_vtl_cancel(poll_file->cpu);
+
+ return 0;
+}
+
+static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt);
+
+ WARN_ON(poll_file->wqh);
+ poll_file->wqh = wqh;
+ add_wait_queue(wqh, &poll_file->wait);
+}
+
+static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input)
+{
+ struct file *file, *old_file;
+ struct mshv_vtl_poll_file *poll_file;
+ struct mshv_vtl_set_poll_file input;
+
+ if (copy_from_user(&input, user_input, sizeof(input)))
+ return -EFAULT;
+
+ if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu))
+ return -EINVAL;
+ /*
+ * CPU hotplug is not supported in VTL2 in OpenHCL, where this driver runs,
+ * so the CPU is expected to remain online after the cpu_online() check above.
+ */
+
+ file = NULL;
+ file = fget(input.fd);
+ if (!file)
+ return -EBADFD;
+
+ poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu));
+ if (!poll_file)
+ return -EINVAL;
+
+ mutex_lock(&mshv_vtl_poll_file_lock);
+
+ if (poll_file->wqh)
+ remove_wait_queue(poll_file->wqh, &poll_file->wait);
+ poll_file->wqh = NULL;
+
+ old_file = poll_file->file;
+ poll_file->file = file;
+ poll_file->cpu = input.cpu;
+
+ if (file) {
+ init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake);
+ init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc);
+ vfs_poll(file, &poll_file->pt);
+ }
+
+ mutex_unlock(&mshv_vtl_poll_file_lock);
+
+ if (old_file)
+ fput(old_file);
+
+ return 0;
+}
+
+/* Static table mapping register names to their corresponding actions */
+static const struct {
+ enum hv_register_name reg_name;
+ int debug_reg_num; /* -1 if not a debug register */
+ u32 msr_addr; /* 0 if not an MSR */
+} reg_table[] = {
+ /* Debug registers */
+ {HV_X64_REGISTER_DR0, 0, 0},
+ {HV_X64_REGISTER_DR1, 1, 0},
+ {HV_X64_REGISTER_DR2, 2, 0},
+ {HV_X64_REGISTER_DR3, 3, 0},
+ {HV_X64_REGISTER_DR6, 6, 0},
+ /* MTRR MSRs */
+ {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap},
+ {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)},
+ {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000},
+};
+
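+/*
+ * Service a register access locally when possible: debug registers are
+ * accessed directly (DR6 only when shared with the hypervisor) and the listed
+ * MTRR MSRs are read/written with rdmsrl()/wrmsrl(). Returns 0 when the access
+ * was handled here, or 1 when the caller must fall back to a hypercall.
+ */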
+static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set)
+{
+ u64 *reg64;
+ enum hv_register_name gpr_name;
+ int i;
+
+ gpr_name = regs->name;
+ reg64 = &regs->value.reg64;
+
+ /* Search for the register in the table */
+ for (i = 0; i < ARRAY_SIZE(reg_table); i++) {
+ if (reg_table[i].reg_name != gpr_name)
+ continue;
+ if (reg_table[i].debug_reg_num != -1) {
+ /* Handle debug registers */
+ if (gpr_name == HV_X64_REGISTER_DR6 &&
+ !mshv_vsm_capabilities.dr6_shared)
+ goto hypercall;
+ if (set)
+ native_set_debugreg(reg_table[i].debug_reg_num, *reg64);
+ else
+ *reg64 = native_get_debugreg(reg_table[i].debug_reg_num);
+ } else {
+ /* Handle MSRs */
+ if (set)
+ wrmsrl(reg_table[i].msr_addr, *reg64);
+ else
+ rdmsrl(reg_table[i].msr_addr, *reg64);
+ }
+ return 0;
+ }
+
+hypercall:
+ return 1;
+}
+
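+/*
+ * Switch the current CPU back to VTL0, forwarding any pending VTL return
+ * actions from the run page to the VP assist page first.
+ */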
+static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0)
+{
+ struct hv_vp_assist_page *hvp;
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+
+	/*
+	 * Copy any VTL return actions (such as signal event direct) set in the
+	 * run page into the VP assist page, if present.
+	 */
+ if (mshv_vsm_capabilities.return_action_available) {
+ u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size);
+
+ WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0);
+
+ /*
+ * Hypervisor will take care of clearing out the actions
+ * set in the assist page.
+ */
+ memcpy(hvp->vtl_ret_actions,
+ mshv_vtl_this_run()->vtl_ret_actions,
+ min_t(u32, offset, sizeof(hvp->vtl_ret_actions)));
+ }
+
+ mshv_vtl_return_call(vtl0);
+}
+
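+/*
+ * Copy a pending message from the intercept SINT slot of the SynIC message
+ * page into the run page's exit_message buffer. Returns false when an
+ * intercept was copied out, true when there was nothing to process.
+ */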
+static bool mshv_vtl_process_intercept(void)
+{
+ struct hv_per_cpu_context *mshv_cpu;
+ void *synic_message_page;
+ struct hv_message *msg;
+ u32 message_type;
+
+ mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ synic_message_page = mshv_cpu->hyp_synic_message_page;
+ if (unlikely(!synic_message_page))
+ return true;
+
+ msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX;
+ message_type = READ_ONCE(msg->header.message_type);
+ if (message_type == HVMSG_NONE)
+ return true;
+
+ memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg));
+ vmbus_signal_eom(msg, message_type);
+
+ return false;
+}
+
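+/*
+ * Run the lower VTL on this CPU until an intercept has been captured into the
+ * run page for user space: loop on VTL returns, handling pending
+ * xfer-to-guest-mode work and cancellation along the way.
+ */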
+static int mshv_vtl_ioctl_return_to_lower_vtl(void)
+{
+ preempt_disable();
+ for (;;) {
+ unsigned long irq_flags;
+ struct hv_vp_assist_page *hvp;
+ int ret;
+
+ if (__xfer_to_guest_mode_work_pending()) {
+ preempt_enable();
+ ret = xfer_to_guest_mode_handle_work();
+ if (ret)
+ return ret;
+ preempt_disable();
+ }
+
+ local_irq_save(irq_flags);
+ if (READ_ONCE(mshv_vtl_this_run()->cancel)) {
+ local_irq_restore(irq_flags);
+ preempt_enable();
+ return -EINTR;
+ }
+
+ mshv_vtl_return(&mshv_vtl_this_run()->cpu_context);
+ local_irq_restore(irq_flags);
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+ this_cpu_inc(num_vtl0_transitions);
+ switch (hvp->vtl_entry_reason) {
+ case MSHV_ENTRY_REASON_INTERRUPT:
+ if (!mshv_vsm_capabilities.intercept_page_available &&
+ likely(!mshv_vtl_process_intercept()))
+ goto done;
+ break;
+
+ case MSHV_ENTRY_REASON_INTERCEPT:
+ WARN_ON(!mshv_vsm_capabilities.intercept_page_available);
+ memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message,
+ sizeof(hvp->intercept_message));
+ goto done;
+
+ default:
+ panic("unknown entry reason: %d", hvp->vtl_entry_reason);
+ }
+ }
+
+done:
+ preempt_enable();
+
+ return 0;
+}
+
+static long
+mshv_vtl_ioctl_get_regs(void __user *user_args)
+{
+ struct mshv_vp_registers args;
+ struct hv_register_assoc reg;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ /* This IOCTL supports processing only one register at a time. */
+ if (args.count != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&reg, (void __user *)args.regs_ptr,
+ sizeof(reg)))
+ return -EFAULT;
+
+ ret = mshv_vtl_get_set_reg(&reg, false);
+ if (!ret)
+		goto copy_args; /* No hypercall needed */
+ ret = vtl_get_vp_register(&reg);
+ if (ret)
+ return ret;
+
+copy_args:
+ if (copy_to_user((void __user *)args.regs_ptr, &reg, sizeof(reg)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static long
+mshv_vtl_ioctl_set_regs(void __user *user_args)
+{
+ struct mshv_vp_registers args;
+ struct hv_register_assoc reg;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ /* This IOCTL supports processing only one register at a time. */
+ if (args.count != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&reg, (void __user *)args.regs_ptr, sizeof(reg)))
+ return -EFAULT;
+
+ ret = mshv_vtl_get_set_reg(&reg, true);
+ if (!ret)
+		return ret; /* No hypercall needed */
+ ret = vtl_set_vp_register(&reg);
+
+ return ret;
+}
+
+static long
+mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ long ret;
+ struct mshv_vtl *vtl = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_SET_POLL_FILE:
+ ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg);
+ break;
+ case MSHV_GET_VP_REGISTERS:
+ ret = mshv_vtl_ioctl_get_regs((void __user *)arg);
+ break;
+ case MSHV_SET_VP_REGISTERS:
+ ret = mshv_vtl_ioctl_set_regs((void __user *)arg);
+ break;
+ case MSHV_RETURN_TO_LOWER_VTL:
+ ret = mshv_vtl_ioctl_return_to_lower_vtl();
+ break;
+ case MSHV_ADD_VTL0_MEMORY:
+ ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg);
+ break;
+ default:
+ dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl);
+ ret = -ENOTTY;
+ }
+
+ return ret;
+}
+
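+/*
+ * The mmap offset encodes both the target CPU (low bits, MSHV_PG_OFF_CPU_MASK)
+ * and which per-CPU page is requested (high bits, MSHV_REAL_OFF_SHIFT): the
+ * run page or, when available, the register page.
+ */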
+static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK;
+ int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT;
+
+ if (!cpu_online(cpu))
+ return VM_FAULT_SIGBUS;
+	/*
+	 * CPU hotplug is not supported in VTL2 in OpenHCL, where this driver runs,
+	 * so the CPU is expected to remain online after the above cpu_online() check.
+	 */
+
+ if (real_off == MSHV_RUN_PAGE_OFFSET) {
+ page = virt_to_page(mshv_vtl_cpu_run(cpu));
+ } else if (real_off == MSHV_REG_PAGE_OFFSET) {
+ if (!mshv_has_reg_page)
+ return VM_FAULT_SIGBUS;
+ page = mshv_vtl_cpu_reg_page(cpu);
+ } else {
+ return VM_FAULT_NOPAGE;
+ }
+
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static const struct vm_operations_struct mshv_vtl_vm_ops = {
+ .fault = mshv_vtl_fault,
+};
+
+static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &mshv_vtl_vm_ops;
+
+ return 0;
+}
+
+static int mshv_vtl_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_vtl *vtl = filp->private_data;
+
+ kfree(vtl);
+
+ return 0;
+}
+
+static const struct file_operations mshv_vtl_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = mshv_vtl_ioctl,
+ .release = mshv_vtl_release,
+ .mmap = mshv_vtl_mmap,
+};
+
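+/*
+ * Program the VTL2 VMBus SINT on the current CPU: route it to the Hyper-V
+ * callback vector and mask or unmask it as requested.
+ */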
+static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask)
+{
+ union hv_synic_sint sint;
+
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = (*mask != 0);
+ sint.auto_eoi = hv_recommend_using_aeoi();
+
+ hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX,
+ sint.as_uint64);
+
+ if (!sint.masked)
+ pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+ else
+ pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+}
+
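+/*
+ * Runs on VMBUS_CONNECT_CPU via smp_call_function_single(): the SynIC message
+ * page is per-CPU, so the pending VTL2 VMBus SINT message has to be consumed
+ * on the CPU that owns it before being copied back to the reader.
+ */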
+static void mshv_vtl_read_remote(void *buffer)
+{
+ struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page +
+ VTL2_VMBUS_SINT_INDEX;
+ u32 message_type = READ_ONCE(msg->header.message_type);
+
+ WRITE_ONCE(has_message, false);
+ if (message_type == HVMSG_NONE)
+ return;
+
+ memcpy(buffer, msg, sizeof(*msg));
+ vmbus_signal_eom(msg, message_type);
+}
+
+static bool vtl_synic_mask_vmbus_sint_masked = true;
+
+static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset)
+{
+ struct hv_message msg = {};
+ int ret;
+
+ if (size < sizeof(msg))
+ return -EINVAL;
+
+ for (;;) {
+ smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true);
+ if (msg.header.message_type != HVMSG_NONE)
+ break;
+
+ if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
+ return 0; /* EOF */
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(fd_wait_queue,
+ READ_ONCE(has_message) ||
+ READ_ONCE(vtl_synic_mask_vmbus_sint_masked));
+ if (ret)
+ return ret;
+ }
+
+ if (copy_to_user(arg, &msg, sizeof(msg)))
+ return -EFAULT;
+
+ return sizeof(msg);
+}
+
+static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait)
+{
+ __poll_t mask = 0;
+
+ poll_wait(filp, &fd_wait_queue, wait);
+ if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+static void mshv_vtl_sint_on_msg_dpc(unsigned long data)
+{
+ WRITE_ONCE(has_message, true);
+ wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
+}
+
+static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg)
+{
+ struct mshv_vtl_sint_post_msg message;
+ u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT];
+
+ if (copy_from_user(&message, arg, sizeof(message)))
+ return -EFAULT;
+ if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
+ return -EINVAL;
+ if (copy_from_user(payload, (void __user *)message.payload_ptr,
+ message.payload_size))
+ return -EFAULT;
+
+ return hv_post_message((union hv_connection_id)message.connection_id,
+ message.message_type, (void *)payload,
+ message.payload_size);
+}
+
+static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg)
+{
+ u64 input, status;
+ struct mshv_vtl_signal_event signal_event;
+
+ if (copy_from_user(&signal_event, arg, sizeof(signal_event)))
+ return -EFAULT;
+
+ input = signal_event.connection_id | ((u64)signal_event.flag << 32);
+
+ status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input);
+
+ return hv_result_to_errno(status);
+}
+
+static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg)
+{
+ struct mshv_vtl_set_eventfd set_eventfd;
+ struct eventfd_ctx *eventfd, *old_eventfd;
+
+ if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd)))
+ return -EFAULT;
+ if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT)
+ return -EINVAL;
+
+ eventfd = NULL;
+ if (set_eventfd.fd >= 0) {
+ eventfd = eventfd_ctx_fdget(set_eventfd.fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+ }
+
+ guard(mutex)(&flag_lock);
+ old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]);
+ WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd);
+
+ if (old_eventfd) {
+ synchronize_rcu();
+ eventfd_ctx_put(old_eventfd);
+ }
+
+ return 0;
+}
+
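+/*
+ * Mask or unmask the VTL2 VMBus SINT on every CPU. When masking, wake up any
+ * readers so that mshv_vtl_sint_read() can return EOF instead of blocking.
+ */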
+static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg)
+{
+ static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex);
+ struct mshv_sint_mask mask;
+
+ if (copy_from_user(&mask, arg, sizeof(mask)))
+ return -EFAULT;
+ guard(mutex)(&vtl2_vmbus_sint_mask_mutex);
+ on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1);
+ WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0);
+ if (mask.mask)
+ wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
+
+ return 0;
+}
+
+static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case MSHV_SINT_POST_MESSAGE:
+ return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg);
+ case MSHV_SINT_SIGNAL_EVENT:
+ return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg);
+ case MSHV_SINT_SET_EVENTFD:
+ return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg);
+ case MSHV_SINT_PAUSE_MESSAGE_STREAM:
+ return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+static const struct file_operations mshv_vtl_sint_ops = {
+ .owner = THIS_MODULE,
+ .read = mshv_vtl_sint_read,
+ .poll = mshv_vtl_sint_poll,
+ .unlocked_ioctl = mshv_vtl_sint_ioctl,
+};
+
+static struct miscdevice mshv_vtl_sint_dev = {
+ .name = "mshv_sint",
+ .fops = &mshv_vtl_sint_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f)
+{
+ struct miscdevice *dev = f->private_data;
+ struct mshv_vtl_hvcall_fd *fd;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ fd = vzalloc(sizeof(*fd));
+ if (!fd)
+ return -ENOMEM;
+ fd->dev = dev;
+ f->private_data = fd;
+ mutex_init(&fd->init_mutex);
+
+ return 0;
+}
+
+static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f)
+{
+ struct mshv_vtl_hvcall_fd *fd;
+
+ fd = f->private_data;
+ if (fd) {
+ vfree(fd);
+ f->private_data = NULL;
+ }
+
+ return 0;
+}
+
+static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd,
+ struct mshv_vtl_hvcall_setup __user *hvcall_setup_user)
+{
+ struct mshv_vtl_hvcall_setup hvcall_setup;
+
+ guard(mutex)(&fd->init_mutex);
+
+ if (fd->allow_map_initialized) {
+ dev_err(fd->dev->this_device,
+ "Hypercall allow map has already been set, pid %d\n",
+ current->pid);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&hvcall_setup, hvcall_setup_user,
+ sizeof(struct mshv_vtl_hvcall_setup))) {
+ return -EFAULT;
+ }
+ if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap))
+ return -EINVAL;
+
+ if (copy_from_user(&fd->allow_bitmap,
+ (void __user *)hvcall_setup.allow_bitmap_ptr,
+ hvcall_setup.bitmap_array_size)) {
+ return -EFAULT;
+ }
+
+ dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n",
+ current->pid);
+ fd->allow_map_initialized = true;
+ return 0;
+}
+
+static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code)
+{
+ return test_bit(call_code, (unsigned long *)fd->allow_bitmap);
+}
+
+static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd,
+ struct mshv_vtl_hvcall __user *hvcall_user)
+{
+ struct mshv_vtl_hvcall hvcall;
+ void *in, *out;
+ int ret;
+
+ if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall)))
+ return -EFAULT;
+ if (hvcall.input_size > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+ if (hvcall.output_size > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+
+	/*
+	 * By default, no hypercalls are allowed. User-mode code has to set up
+	 * the allow bitmap once via MSHV_HVCALL_SETUP before issuing any.
+	 */
+
+ if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) {
+ dev_err(fd->dev->this_device,
+ "Hypercall with control data %#llx isn't allowed\n",
+ hvcall.control);
+ return -EPERM;
+ }
+
+	/*
+	 * This may be a problem for the Confidential VM (CVM) use case, where the
+	 * per-cpu input and output pages allocated by the Hyper-V driver
+	 * (hyperv_pcpu_input_arg and hyperv_pcpu_output_arg) must be used for
+	 * making a hypervisor call.
+	 *
+	 * TODO: Take care of this when CVM support is added.
+	 */
+	in = (void *)__get_free_page(GFP_KERNEL);
+	out = (void *)__get_free_page(GFP_KERNEL);
+	if (!in || !out) {
+		/* free_page() ignores a zero address, so a partial allocation unwinds safely */
+		ret = -ENOMEM;
+		goto free_pages;
+	}
+
+ if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) {
+ ret = -EFAULT;
+ goto free_pages;
+ }
+
+ hvcall.status = hv_do_hypercall(hvcall.control, in, out);
+
+ if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) {
+ ret = -EFAULT;
+ goto free_pages;
+ }
+ ret = put_user(hvcall.status, &hvcall_user->status);
+free_pages:
+ free_page((unsigned long)in);
+ free_page((unsigned long)out);
+
+ return ret;
+}
+
+static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ struct mshv_vtl_hvcall_fd *fd = f->private_data;
+
+ switch (cmd) {
+ case MSHV_HVCALL_SETUP:
+ return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg);
+ case MSHV_HVCALL:
+ return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg);
+ default:
+ break;
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static const struct file_operations mshv_vtl_hvcall_dev_file_ops = {
+ .owner = THIS_MODULE,
+ .open = mshv_vtl_hvcall_dev_open,
+ .release = mshv_vtl_hvcall_dev_release,
+ .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl,
+};
+
+static struct miscdevice mshv_vtl_hvcall_dev = {
+ .name = "mshv_hvcall",
+ .nodename = "mshv_hvcall",
+ .fops = &mshv_vtl_hvcall_dev_file_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)
+{
+ pid_t pid = task_pid_vnr(current);
+ uid_t uid = current_uid().val;
+ int ret = 0;
+
+ pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid);
+
+ if (capable(CAP_SYS_ADMIN)) {
+ filp->private_data = inodep;
+ } else {
+		pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d\n",
+ __func__, pid, uid);
+ ret = -EPERM;
+ }
+
+ return ret;
+}
+
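+/*
+ * Check that a huge mapping of the given size would lie entirely inside the
+ * VMA and that the faulting address is consistent with vmf->pgoff at that
+ * alignment; on success return the PFN rounded down to the huge-page boundary.
+ */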
+static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn)
+{
+ unsigned long mask = size - 1;
+ unsigned long start = vmf->address & ~mask;
+ unsigned long end = start + size;
+ bool is_valid;
+
+ is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) &&
+ start >= vmf->vma->vm_start &&
+ end <= vmf->vma->vm_end;
+
+ if (is_valid)
+ *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT);
+
+ return is_valid;
+}
+
+static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order)
+{
+ unsigned long pfn = vmf->pgoff;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ switch (order) {
+ case 0:
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+
+ case PMD_ORDER:
+ if (can_fault(vmf, PMD_SIZE, &pfn))
+ ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+ return ret;
+
+ case PUD_ORDER:
+ if (can_fault(vmf, PUD_SIZE, &pfn))
+ ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+ return ret;
+
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+}
+
+static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf)
+{
+ return mshv_vtl_low_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct mshv_vtl_low_vm_ops = {
+ .fault = mshv_vtl_low_fault,
+ .huge_fault = mshv_vtl_low_huge_fault,
+};
+
+static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &mshv_vtl_low_vm_ops;
+ vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP);
+
+ return 0;
+}
+
+static const struct file_operations mshv_vtl_low_file_ops = {
+ .owner = THIS_MODULE,
+ .open = mshv_vtl_low_open,
+ .mmap = mshv_vtl_low_mmap,
+};
+
+static struct miscdevice mshv_vtl_low = {
+ .name = "mshv_vtl_low",
+ .nodename = "mshv_vtl_low",
+ .fops = &mshv_vtl_low_file_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int __init mshv_vtl_init(void)
+{
+	int ret;
+	struct device *dev;
+
+	/*
+	 * This creates /dev/mshv, which provides the functionality to create
+	 * VTLs and partitions.
+	 */
+	ret = misc_register(&mshv_dev);
+	if (ret) {
+		pr_err("mshv device register failed: %d\n", ret);
+		/* Nothing was registered yet, so do not fall through to misc_deregister() */
+		return ret;
+	}
+	/* this_device is only valid once misc_register() has succeeded */
+	dev = mshv_dev.this_device;
+
+ tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0);
+ init_waitqueue_head(&fd_wait_queue);
+
+ if (mshv_vtl_get_vsm_regs()) {
+		dev_emerg(dev, "Unable to get VSM capabilities!\n");
+ ret = -ENODEV;
+ goto free_dev;
+ }
+ if (mshv_vtl_configure_vsm_partition(dev)) {
+		dev_emerg(dev, "VSM configuration failed!\n");
+ ret = -ENODEV;
+ goto free_dev;
+ }
+
+ mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset);
+ ret = hv_vtl_setup_synic();
+ if (ret)
+ goto free_dev;
+
+ /*
+ * mshv_sint device adds VMBus relay ioctl support.
+ * This provides a channel for VTL0 to communicate with VTL2.
+ */
+ ret = misc_register(&mshv_vtl_sint_dev);
+ if (ret)
+ goto free_synic;
+
+	/*
+	 * The mshv_hvcall device adds an interface that lets userspace issue
+	 * hypercalls directly, subject to the allow bitmap.
+	 */
+ ret = misc_register(&mshv_vtl_hvcall_dev);
+ if (ret)
+ goto free_sint;
+
+	/*
+	 * The mshv_vtl_low device is used to map VTL0 address space into a user-mode
+	 * process in VTL2; it implements mmap() so that such a process can map VTL0
+	 * memory directly.
+	 */
+ ret = misc_register(&mshv_vtl_low);
+ if (ret)
+ goto free_hvcall;
+
+	/*
+	 * The "mshv vtl mem dev" device is later used to set up VTL0 memory.
+	 */
+ mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL);
+ if (!mem_dev) {
+ ret = -ENOMEM;
+ goto free_low;
+ }
+
+ mutex_init(&mshv_vtl_poll_file_lock);
+
+ device_initialize(mem_dev);
+ dev_set_name(mem_dev, "mshv vtl mem dev");
+ ret = device_add(mem_dev);
+ if (ret) {
+ dev_err(dev, "mshv vtl mem dev add: %d\n", ret);
+ goto free_mem;
+ }
+
+ return 0;
+
+free_mem:
+ kfree(mem_dev);
+free_low:
+ misc_deregister(&mshv_vtl_low);
+free_hvcall:
+ misc_deregister(&mshv_vtl_hvcall_dev);
+free_sint:
+ misc_deregister(&mshv_vtl_sint_dev);
+free_synic:
+ hv_vtl_remove_synic();
+free_dev:
+ misc_deregister(&mshv_dev);
+
+ return ret;
+}
+
+static void __exit mshv_vtl_exit(void)
+{
+ device_del(mem_dev);
+ kfree(mem_dev);
+ misc_deregister(&mshv_vtl_low);
+ misc_deregister(&mshv_vtl_hvcall_dev);
+ misc_deregister(&mshv_vtl_sint_dev);
+ hv_vtl_remove_synic();
+ misc_deregister(&mshv_dev);
+}
+
+module_init(mshv_vtl_init);
+module_exit(mshv_vtl_exit);
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 23ce1fb70de1..3c421a7f78c0 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -184,7 +184,8 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
- struct page *pages, u32 page_cnt, u32 max_pkt_size)
+ struct page *pages, u32 page_cnt, u32 max_pkt_size,
+ bool confidential)
{
struct page **pages_wraparound;
int i;
@@ -208,7 +209,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
ring_info->ring_buffer = (struct hv_ring_buffer *)
vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
- pgprot_decrypted(PAGE_KERNEL));
+ confidential ? PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL));
kfree(pages_wraparound);
if (!ring_info->ring_buffer)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 67734dc73e16..a53af6fe81a6 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -36,6 +36,7 @@
#include <linux/syscore_ops.h>
#include <linux/dma-map-ops.h>
#include <linux/pci.h>
+#include <linux/export.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
@@ -57,6 +58,18 @@ int vmbus_irq;
int vmbus_interrupt;
/*
+ * If the Confidential VMBus is used, the data on the "wire" is not
+ * visible to either the host or the hypervisor.
+ */
+static bool is_confidential;
+
+bool vmbus_is_confidential(void)
+{
+ return is_confidential;
+}
+EXPORT_SYMBOL_GPL(vmbus_is_confidential);
+
+/*
* The panic notifier below is responsible solely for unloading the
* vmbus connection, which is necessary in a panic event.
*
@@ -1045,12 +1058,9 @@ static void vmbus_onmessage_work(struct work_struct *work)
kfree(ctx);
}
-void vmbus_on_msg_dpc(unsigned long data)
+static void __vmbus_on_msg_dpc(void *message_page_addr)
{
- struct hv_per_cpu_context *hv_cpu = (void *)data;
- void *page_addr = hv_cpu->synic_message_page;
- struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
- VMBUS_MESSAGE_SINT;
+ struct hv_message msg_copy, *msg;
struct vmbus_channel_message_header *hdr;
enum vmbus_channel_message_type msgtype;
const struct vmbus_channel_message_table_entry *entry;
@@ -1058,6 +1068,10 @@ void vmbus_on_msg_dpc(unsigned long data)
__u8 payload_size;
u32 message_type;
+ if (!message_page_addr)
+ return;
+ msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
+
/*
* 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
* it is being used in 'struct vmbus_channel_message_header' definition
@@ -1183,6 +1197,14 @@ msg_handled:
vmbus_signal_eom(msg, message_type);
}
+void vmbus_on_msg_dpc(unsigned long data)
+{
+ struct hv_per_cpu_context *hv_cpu = (void *)data;
+
+ __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page);
+ __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page);
+}
+
#ifdef CONFIG_PM_SLEEP
/*
* Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
@@ -1221,21 +1243,19 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
#endif /* CONFIG_PM_SLEEP */
/*
- * Schedule all channels with events pending
+ * Schedule all channels with events pending.
+ * The event page can be directly checked to get the id of
+ * the channel that has the interrupt pending.
*/
-static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
+static void vmbus_chan_sched(void *event_page_addr)
{
unsigned long *recv_int_page;
u32 maxbits, relid;
+ union hv_synic_event_flags *event;
- /*
- * The event page can be directly checked to get the id of
- * the channel that has the interrupt pending.
- */
- void *page_addr = hv_cpu->synic_event_page;
- union hv_synic_event_flags *event
- = (union hv_synic_event_flags *)page_addr +
- VMBUS_MESSAGE_SINT;
+ if (!event_page_addr)
+ return;
+ event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT;
maxbits = HV_EVENT_FLAGS_COUNT;
recv_int_page = event->flags;
@@ -1243,6 +1263,11 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
if (unlikely(!recv_int_page))
return;
+ /*
+ * Suggested-by: Michael Kelley <mhklinux@outlook.com>
+ * One possible optimization would be to keep track of the largest relID that's in use,
+ * and only scan up to that relID.
+ */
for_each_set_bit(relid, recv_int_page, maxbits) {
void (*callback_fn)(void *context);
struct vmbus_channel *channel;
@@ -1306,29 +1331,39 @@ sched_unlock_rcu:
}
}
-static void vmbus_isr(void)
+static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr)
{
- struct hv_per_cpu_context *hv_cpu
- = this_cpu_ptr(hv_context.cpu_context);
- void *page_addr;
struct hv_message *msg;
- vmbus_chan_sched(hv_cpu);
-
- page_addr = hv_cpu->synic_message_page;
- msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
+ if (!message_page_addr)
+ return;
+ msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
/* Check if there are actual msgs to be processed */
if (msg->header.message_type != HVMSG_NONE) {
if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
hv_stimer0_isr();
vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
- } else
+ } else {
tasklet_schedule(&hv_cpu->msg_dpc);
+ }
}
+}
+
+void vmbus_isr(void)
+{
+ struct hv_per_cpu_context *hv_cpu
+ = this_cpu_ptr(hv_context.cpu_context);
+
+ vmbus_chan_sched(hv_cpu->hyp_synic_event_page);
+ vmbus_chan_sched(hv_cpu->para_synic_event_page);
+
+ vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page);
+ vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page);
add_interrupt_randomness(vmbus_interrupt);
}
+EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
{
@@ -1343,6 +1378,59 @@ static void vmbus_percpu_work(struct work_struct *work)
hv_synic_init(cpu);
}
+static int vmbus_alloc_synic_and_connect(void)
+{
+ int ret, cpu;
+ struct work_struct __percpu *works;
+ int hyperv_cpuhp_online;
+
+ ret = hv_synic_alloc();
+ if (ret < 0)
+ goto err_alloc;
+
+ works = alloc_percpu(struct work_struct);
+ if (!works) {
+ ret = -ENOMEM;
+ goto err_alloc;
+ }
+
+ /*
+ * Initialize the per-cpu interrupt state and stimer state.
+ * Then connect to the host.
+ */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, vmbus_percpu_work);
+ schedule_work_on(cpu, work);
+ }
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ /* Register the callbacks for possible CPU online/offline'ing */
+ ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
+ hv_synic_init, hv_synic_cleanup);
+ cpus_read_unlock();
+ free_percpu(works);
+ if (ret < 0)
+ goto err_alloc;
+ hyperv_cpuhp_online = ret;
+
+ ret = vmbus_connect();
+ if (ret)
+ goto err_connect;
+ return 0;
+
+err_connect:
+ cpuhp_remove_state(hyperv_cpuhp_online);
+ return -ENODEV;
+err_alloc:
+ hv_synic_free();
+ return -ENOMEM;
+}
+
/*
* vmbus_bus_init -Main vmbus driver initialization routine.
*
@@ -1353,8 +1441,7 @@ static void vmbus_percpu_work(struct work_struct *work)
*/
static int vmbus_bus_init(void)
{
- int ret, cpu;
- struct work_struct __percpu *works;
+ int ret;
ret = hv_init();
if (ret != 0) {
@@ -1389,41 +1476,15 @@ static int vmbus_bus_init(void)
}
}
- ret = hv_synic_alloc();
- if (ret)
- goto err_alloc;
-
- works = alloc_percpu(struct work_struct);
- if (!works) {
- ret = -ENOMEM;
- goto err_alloc;
- }
-
/*
- * Initialize the per-cpu interrupt state and stimer state.
- * Then connect to the host.
+ * Cache the value as getting it involves a VM exit on x86(_64), and
+	 * doing that on each VP while initializing SynICs wastes time.
*/
- cpus_read_lock();
- for_each_online_cpu(cpu) {
- struct work_struct *work = per_cpu_ptr(works, cpu);
-
- INIT_WORK(work, vmbus_percpu_work);
- schedule_work_on(cpu, work);
- }
-
- for_each_online_cpu(cpu)
- flush_work(per_cpu_ptr(works, cpu));
-
- /* Register the callbacks for possible CPU online/offline'ing */
- ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
- hv_synic_init, hv_synic_cleanup);
- cpus_read_unlock();
- free_percpu(works);
- if (ret < 0)
- goto err_alloc;
- hyperv_cpuhp_online = ret;
-
- ret = vmbus_connect();
+ is_confidential = ms_hyperv.confidential_vmbus_available;
+ if (is_confidential)
+ pr_info("Establishing connection to the confidential VMBus\n");
+ hv_para_set_sint_proxy(!is_confidential);
+ ret = vmbus_alloc_synic_and_connect();
if (ret)
goto err_connect;
@@ -1439,9 +1500,6 @@ static int vmbus_bus_init(void)
return 0;
err_connect:
- cpuhp_remove_state(hyperv_cpuhp_online);
-err_alloc:
- hv_synic_free();
if (vmbus_irq == -1) {
hv_remove_vmbus_handler();
} else {
@@ -2798,7 +2856,7 @@ static void hv_crash_handler(struct pt_regs *regs)
*/
cpu = smp_processor_id();
hv_stimer_cleanup(cpu);
- hv_synic_disable_regs(cpu);
+ hv_hyp_synic_disable_regs(cpu);
};
static int hv_synic_suspend(void *data)
@@ -2823,14 +2881,14 @@ static int hv_synic_suspend(void *data)
* interrupts-disabled context.
*/
- hv_synic_disable_regs(0);
+ hv_hyp_synic_disable_regs(0);
return 0;
}
static void hv_synic_resume(void *data)
{
- hv_synic_enable_regs(0);
+ hv_hyp_synic_enable_regs(0);
/*
* Note: we don't need to call hv_stimer_init(0), because the timer
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 64ba6bc807d9..ecedab554c80 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -62,6 +62,8 @@ struct ms_hyperv_info {
};
};
u64 shared_gpa_boundary;
+ bool msi_ext_dest_id;
+ bool confidential_vmbus_available;
};
extern struct ms_hyperv_info ms_hyperv;
extern bool hv_nested;
@@ -124,10 +126,12 @@ static inline unsigned int hv_repcomp(u64 status)
/*
* Rep hypercalls. Callers of this functions are supposed to ensure that
- * rep_count and varhead_size comply with Hyper-V hypercall definition.
+ * rep_count, varhead_size, and rep_start comply with Hyper-V hypercall
+ * definition.
*/
-static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
- void *input, void *output)
+static inline u64 hv_do_rep_hypercall_ex(u16 code, u16 rep_count,
+ u16 varhead_size, u16 rep_start,
+ void *input, void *output)
{
u64 control = code;
u64 status;
@@ -135,6 +139,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
+ control |= (u64)rep_start << HV_HYPERCALL_REP_START_OFFSET;
do {
status = hv_do_hypercall(control, input, output);
@@ -152,6 +157,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
return status;
}
+/* For the typical case where rep_start is 0 */
+static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
+ void *input, void *output)
+{
+ return hv_do_rep_hypercall_ex(code, rep_count, varhead_size, 0,
+ input, output);
+}
+
/* Generate the guest OS identifier as described in the Hyper-V TLFS */
static inline u64 hv_generate_guest_id(u64 kernel_version)
{
@@ -163,46 +176,6 @@ static inline u64 hv_generate_guest_id(u64 kernel_version)
return guest_id;
}
-#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
-/* Free the message slot and signal end-of-message if required */
-static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
-{
- /*
- * On crash we're reading some other CPU's message page and we need
- * to be careful: this other CPU may already had cleared the header
- * and the host may already had delivered some other message there.
- * In case we blindly write msg->header.message_type we're going
- * to lose it. We can still lose a message of the same type but
- * we count on the fact that there can only be one
- * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
- * on crash.
- */
- if (cmpxchg(&msg->header.message_type, old_msg_type,
- HVMSG_NONE) != old_msg_type)
- return;
-
- /*
- * The cmxchg() above does an implicit memory barrier to
- * ensure the write to MessageType (ie set to
- * HVMSG_NONE) happens before we read the
- * MessagePending and EOMing. Otherwise, the EOMing
- * will not deliver any more messages since there is
- * no empty slot
- */
- if (msg->header.message_flags.msg_pending) {
- /*
- * This will cause message queue rescan to
- * possibly deliver another msg from the
- * hypervisor
- */
- hv_set_msr(HV_MSR_EOM, 0);
- }
-}
-
-extern int vmbus_interrupt;
-extern int vmbus_irq;
-#endif /* CONFIG_HYPERV_VMBUS */
-
int hv_get_hypervisor_version(union hv_hypervisor_version_info *info);
void hv_setup_vmbus_handler(void (*handler)(void));
@@ -336,6 +309,10 @@ bool hv_is_isolation_supported(void);
bool hv_isolation_type_snp(void);
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
+void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set);
+void hv_para_set_sint_proxy(bool enable);
+u64 hv_para_get_synic_register(unsigned int reg);
+void hv_para_set_synic_register(unsigned int reg, u64 val);
void hyperv_cleanup(void);
bool hv_query_ext_cap(u64 cap_query);
void hv_setup_dma_ops(struct device *dev, bool coherent);
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index 77abddfc750e..04b18d0e37af 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -260,6 +260,7 @@ union hv_hypervisor_version_info {
#define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082
/* Support for the extended IOAPIC RTE format */
#define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2)
+#define HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE BIT(3)
#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
#define HYPERV_CPUID_MIN 0x40000005
@@ -464,18 +465,21 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
#define HVCALL_RESET_DEBUG_SESSION 0x006b
#define HVCALL_MAP_STATS_PAGE 0x006c
#define HVCALL_UNMAP_STATS_PAGE 0x006d
+#define HVCALL_SET_SYSTEM_PROPERTY 0x006f
#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076
#define HVCALL_GET_SYSTEM_PROPERTY 0x007b
#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c
#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d
#define HVCALL_RETARGET_INTERRUPT 0x007e
+#define HVCALL_NOTIFY_PARTITION_EVENT 0x0087
+#define HVCALL_ENTER_SLEEP_STATE 0x0084
#define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b
#define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091
#define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094
#define HVCALL_CREATE_PORT 0x0095
#define HVCALL_CONNECT_PORT 0x0096
#define HVCALL_START_VP 0x0099
-#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a
+#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
#define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0
@@ -490,8 +494,11 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
#define HVCALL_GET_VP_STATE 0x00e3
#define HVCALL_SET_VP_STATE 0x00e4
#define HVCALL_GET_VP_CPUID_VALUES 0x00f4
+#define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101
#define HVCALL_MMIO_READ 0x0106
#define HVCALL_MMIO_WRITE 0x0107
+#define HVCALL_DISABLE_HYP_EX 0x010f
+#define HVCALL_MAP_STATS_PAGE2 0x0131
/* HV_HYPERCALL_INPUT */
#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
@@ -880,6 +887,48 @@ struct hv_get_vp_from_apic_id_in {
u32 apic_ids[];
} __packed;
+union hv_register_vsm_partition_config {
+ u64 as_uint64;
+ struct {
+ u64 enable_vtl_protection : 1;
+ u64 default_vtl_protection_mask : 4;
+ u64 zero_memory_on_reset : 1;
+ u64 deny_lower_vtl_startup : 1;
+ u64 intercept_acceptance : 1;
+ u64 intercept_enable_vtl_protection : 1;
+ u64 intercept_vp_startup : 1;
+ u64 intercept_cpuid_unimplemented : 1;
+ u64 intercept_unrecoverable_exception : 1;
+ u64 intercept_page : 1;
+ u64 mbz : 51;
+ } __packed;
+};
+
+union hv_register_vsm_capabilities {
+ u64 as_uint64;
+ struct {
+ u64 dr6_shared: 1;
+ u64 mbec_vtl_mask: 16;
+ u64 deny_lower_vtl_startup: 1;
+ u64 supervisor_shadow_stack: 1;
+ u64 hardware_hvpt_available: 1;
+ u64 software_hvpt_available: 1;
+ u64 hardware_hvpt_range_bits: 6;
+ u64 intercept_page_available: 1;
+ u64 return_action_available: 1;
+ u64 reserved: 35;
+ } __packed;
+};
+
+union hv_register_vsm_page_offsets {
+ struct {
+ u64 vtl_call_offset : 12;
+ u64 vtl_return_offset : 12;
+ u64 reserved_mbz : 40;
+ } __packed;
+ u64 as_uint64;
+};
+
struct hv_nested_enlightenments_control {
struct {
u32 directhypercall : 1;
@@ -1002,6 +1051,70 @@ enum hv_register_name {
/* VSM */
HV_REGISTER_VSM_VP_STATUS = 0x000D0003,
+
+ /* Synthetic VSM registers */
+ HV_REGISTER_VSM_CODE_PAGE_OFFSETS = 0x000D0002,
+ HV_REGISTER_VSM_CAPABILITIES = 0x000D0006,
+ HV_REGISTER_VSM_PARTITION_CONFIG = 0x000D0007,
+
+#if defined(CONFIG_X86)
+ /* X64 Debug Registers */
+ HV_X64_REGISTER_DR0 = 0x00050000,
+ HV_X64_REGISTER_DR1 = 0x00050001,
+ HV_X64_REGISTER_DR2 = 0x00050002,
+ HV_X64_REGISTER_DR3 = 0x00050003,
+ HV_X64_REGISTER_DR6 = 0x00050004,
+ HV_X64_REGISTER_DR7 = 0x00050005,
+
+ /* X64 Cache control MSRs */
+ HV_X64_REGISTER_MSR_MTRR_CAP = 0x0008000D,
+ HV_X64_REGISTER_MSR_MTRR_DEF_TYPE = 0x0008000E,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0 = 0x00080010,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1 = 0x00080011,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2 = 0x00080012,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3 = 0x00080013,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4 = 0x00080014,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5 = 0x00080015,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6 = 0x00080016,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7 = 0x00080017,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8 = 0x00080018,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9 = 0x00080019,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA = 0x0008001A,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB = 0x0008001B,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC = 0x0008001C,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASED = 0x0008001D,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE = 0x0008001E,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF = 0x0008001F,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0 = 0x00080040,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1 = 0x00080041,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2 = 0x00080042,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3 = 0x00080043,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4 = 0x00080044,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5 = 0x00080045,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6 = 0x00080046,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7 = 0x00080047,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8 = 0x00080048,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9 = 0x00080049,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA = 0x0008004A,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB = 0x0008004B,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC = 0x0008004C,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD = 0x0008004D,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE = 0x0008004E,
+ HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF = 0x0008004F,
+ HV_X64_REGISTER_MSR_MTRR_FIX64K00000 = 0x00080070,
+ HV_X64_REGISTER_MSR_MTRR_FIX16K80000 = 0x00080071,
+ HV_X64_REGISTER_MSR_MTRR_FIX16KA0000 = 0x00080072,
+ HV_X64_REGISTER_MSR_MTRR_FIX4KC0000 = 0x00080073,
+ HV_X64_REGISTER_MSR_MTRR_FIX4KC8000 = 0x00080074,
+ HV_X64_REGISTER_MSR_MTRR_FIX4KD0000 = 0x00080075,
+ HV_X64_REGISTER_MSR_MTRR_FIX4KD8000 = 0x00080076,
+ HV_X64_REGISTER_MSR_MTRR_FIX4KE0000 = 0x00080077,
+ HV_X64_REGISTER_MSR_MTRR_FIX4KE8000 = 0x00080078,
+ HV_X64_REGISTER_MSR_MTRR_FIX4KF0000 = 0x00080079,
+ HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
+
+ HV_X64_REGISTER_REG_PAGE = 0x0009001C,
+#endif
};
/*
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index b4067ada02cf..469186df7826 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -376,6 +376,46 @@ struct hv_input_set_partition_property {
u64 property_value;
} __packed;
+union hv_partition_property_arg {
+ u64 as_uint64;
+ struct {
+ union {
+ u32 arg;
+ u32 vp_index;
+ };
+ u16 reserved0;
+ u8 reserved1;
+ u8 object_type;
+ } __packed;
+};
+
+struct hv_input_get_partition_property_ex {
+ u64 partition_id;
+ u32 property_code; /* enum hv_partition_property_code */
+ u32 padding;
+ union {
+ union hv_partition_property_arg arg_data;
+ u64 arg;
+ };
+} __packed;
+
+/*
+ * NOTE: Should use hv_input_set_partition_property_ex_header to compute this
+ * size, but hv_input_get_partition_property_ex is identical so it suffices
+ */
+#define HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE \
+ (HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_partition_property_ex))
+
+union hv_partition_property_ex {
+ u8 buffer[HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE];
+ struct hv_partition_property_vmm_capabilities vmm_capabilities;
+ /* More fields to be filled in when needed */
+};
+
+struct hv_output_get_partition_property_ex {
+ union hv_partition_property_ex property_value;
+} __packed;
+
enum hv_vp_state_page_type {
HV_VP_STATE_PAGE_REGISTERS = 0,
HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1,
@@ -539,9 +579,15 @@ union hv_interrupt_control {
u64 as_uint64;
struct {
u32 interrupt_type; /* enum hv_interrupt_type */
+#if IS_ENABLED(CONFIG_X86)
u32 level_triggered : 1;
u32 logical_dest_mode : 1;
u32 rsvd : 30;
+#elif IS_ENABLED(CONFIG_ARM64)
+ u32 rsvd1 : 2;
+ u32 asserted : 1;
+ u32 rsvd2 : 29;
+#endif
} __packed;
};
diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
index 858f6a3925b3..41a29bf8ec14 100644
--- a/include/hyperv/hvhdk_mini.h
+++ b/include/hyperv/hvhdk_mini.h
@@ -96,8 +96,34 @@ enum hv_partition_property_code {
HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007,
HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008,
HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009,
+
+ /* Extended properties with larger property values */
+ HV_PARTITION_PROPERTY_VMM_CAPABILITIES = 0x00090007,
};
+#define HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT 1
+#define HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT 59
+
+struct hv_partition_property_vmm_capabilities {
+ u16 bank_count;
+ u16 reserved[3];
+ union {
+ u64 as_uint64[HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT];
+ struct {
+ u64 map_gpa_preserve_adjustable: 1;
+ u64 vmm_can_provide_overlay_gpfn: 1;
+ u64 vp_affinity_property: 1;
+#if IS_ENABLED(CONFIG_ARM64)
+ u64 vmm_can_provide_gic_overlay_locations: 1;
+#else
+ u64 reservedbit3: 1;
+#endif
+ u64 assignable_synthetic_proc_features: 1;
+ u64 reserved0: HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT;
+ } __packed;
+ };
+} __packed;
+
enum hv_snp_status {
HV_SNP_STATUS_NONE = 0,
HV_SNP_STATUS_AVAILABLE = 1,
@@ -114,8 +140,33 @@ enum hv_snp_status {
enum hv_system_property {
/* Add more values when needed */
+ HV_SYSTEM_PROPERTY_SLEEP_STATE = 3,
HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15,
HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21,
+ HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47,
+};
+
+#define HV_PFN_RANGE_PGBITS 24 /* HV_SPA_PAGE_RANGE_ADDITIONAL_PAGES_BITS */
+union hv_pfn_range { /* HV_SPA_PAGE_RANGE */
+ u64 as_uint64;
+ struct {
+ /* 39:0: base pfn. 63:40: additional pages */
+ u64 base_pfn : 64 - HV_PFN_RANGE_PGBITS;
+ u64 add_pfns : HV_PFN_RANGE_PGBITS;
+ } __packed;
+};
+
+enum hv_sleep_state {
+ HV_SLEEP_STATE_S1 = 1,
+ HV_SLEEP_STATE_S2 = 2,
+ HV_SLEEP_STATE_S3 = 3,
+ HV_SLEEP_STATE_S4 = 4,
+ HV_SLEEP_STATE_S5 = 5,
+ /*
+	 * After the hypervisor has received this, any follow-up sleep
+	 * state registration requests will be rejected.
+ */
+ HV_SLEEP_STATE_LOCK = 6
};
enum hv_dynamic_processor_feature_property {
@@ -142,15 +193,50 @@ struct hv_output_get_system_property {
#if IS_ENABLED(CONFIG_X86)
u64 hv_processor_feature_value;
#endif
+ union hv_pfn_range hv_cda_info; /* CrashdumpAreaAddress */
+ u64 hv_tramp_pa; /* CrashdumpTrampolineAddress */
+ };
+} __packed;
+
+struct hv_sleep_state_info {
+ u32 sleep_state; /* enum hv_sleep_state */
+ u8 pm1a_slp_typ;
+ u8 pm1b_slp_typ;
+} __packed;
+
+struct hv_input_set_system_property {
+ u32 property_id; /* enum hv_system_property */
+ u32 reserved;
+ union {
+ /* More fields to be filled in when needed */
+ struct hv_sleep_state_info set_sleep_state_info;
+
+ /*
+ * Add a reserved field to ensure the union is 8-byte aligned as
+ * existing members may not be. This is a temporary measure
+ * until all remaining members are added.
+ */
+ u64 reserved0[8];
};
} __packed;
+struct hv_input_enter_sleep_state { /* HV_INPUT_ENTER_SLEEP_STATE */
+ u32 sleep_state; /* enum hv_sleep_state */
+} __packed;
+
struct hv_input_map_stats_page {
u32 type; /* enum hv_stats_object_type */
u32 padding;
union hv_stats_object_identity identity;
} __packed;
+struct hv_input_map_stats_page2 {
+ u32 type; /* enum hv_stats_object_type */
+ u32 padding;
+ union hv_stats_object_identity identity;
+ u64 map_location;
+} __packed;
+
struct hv_output_map_stats_page {
u64 map_location;
} __packed;
@@ -234,6 +320,48 @@ union hv_gpa_page_access_state {
u8 as_uint8;
} __packed;
+enum hv_crashdump_action {
+ HV_CRASHDUMP_NONE = 0,
+ HV_CRASHDUMP_SUSPEND_ALL_VPS,
+ HV_CRASHDUMP_PREPARE_FOR_STATE_SAVE,
+ HV_CRASHDUMP_STATE_SAVED,
+ HV_CRASHDUMP_ENTRY,
+};
+
+struct hv_partition_event_root_crashdump_input {
+ u32 crashdump_action; /* enum hv_crashdump_action */
+} __packed;
+
+struct hv_input_disable_hyp_ex { /* HV_X64_INPUT_DISABLE_HYPERVISOR_EX */
+ u64 rip;
+ u64 arg;
+} __packed;
+
+struct hv_crashdump_area { /* HV_CRASHDUMP_AREA */
+ u32 version;
+ union {
+ u32 flags_as_uint32;
+ struct {
+ u32 cda_valid : 1;
+ u32 cda_unused : 31;
+ } __packed;
+ };
+ /* more unused fields */
+} __packed;
+
+union hv_partition_event_input {
+ struct hv_partition_event_root_crashdump_input crashdump_input;
+};
+
+enum hv_partition_event {
+ HV_PARTITION_EVENT_ROOT_CRASHDUMP = 2,
+};
+
+struct hv_input_notify_partition_event {
+ u32 event; /* enum hv_partition_event */
+ union hv_partition_event_input input;
+} __packed;
+
struct hv_lp_startup_status {
u64 hv_status;
u64 substatus1;
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 3eac51d68426..45875903960e 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -11,6 +11,10 @@
#define __has_builtin(x) (0)
#endif
+/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
+#define ___PASTE(a, b) a##b
+#define __PASTE(a, b) ___PASTE(a, b)
+
#ifndef __ASSEMBLY__
/*
@@ -79,10 +83,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
# define __builtin_warning(x, y...) (1)
#endif /* __CHECKER__ */
-/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
-#define ___PASTE(a,b) a##b
-#define __PASTE(a,b) ___PASTE(a,b)
-
#ifdef __KERNEL__
/* Attributes */
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 59826c89171c..dfc516c1c719 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -265,16 +265,18 @@ static inline u32 hv_get_avail_to_write_percent(
* Linux kernel.
*/
-#define VERSION_WS2008 ((0 << 16) | (13))
-#define VERSION_WIN7 ((1 << 16) | (1))
-#define VERSION_WIN8 ((2 << 16) | (4))
-#define VERSION_WIN8_1 ((3 << 16) | (0))
-#define VERSION_WIN10 ((4 << 16) | (0))
-#define VERSION_WIN10_V4_1 ((4 << 16) | (1))
-#define VERSION_WIN10_V5 ((5 << 16) | (0))
-#define VERSION_WIN10_V5_1 ((5 << 16) | (1))
-#define VERSION_WIN10_V5_2 ((5 << 16) | (2))
-#define VERSION_WIN10_V5_3 ((5 << 16) | (3))
+#define VMBUS_MAKE_VERSION(MAJ, MIN) ((((u32)MAJ) << 16) | (MIN))
+#define VERSION_WS2008 VMBUS_MAKE_VERSION(0, 13)
+#define VERSION_WIN7 VMBUS_MAKE_VERSION(1, 1)
+#define VERSION_WIN8 VMBUS_MAKE_VERSION(2, 4)
+#define VERSION_WIN8_1 VMBUS_MAKE_VERSION(3, 0)
+#define VERSION_WIN10 VMBUS_MAKE_VERSION(4, 0)
+#define VERSION_WIN10_V4_1 VMBUS_MAKE_VERSION(4, 1)
+#define VERSION_WIN10_V5 VMBUS_MAKE_VERSION(5, 0)
+#define VERSION_WIN10_V5_1 VMBUS_MAKE_VERSION(5, 1)
+#define VERSION_WIN10_V5_2 VMBUS_MAKE_VERSION(5, 2)
+#define VERSION_WIN10_V5_3 VMBUS_MAKE_VERSION(5, 3)
+#define VERSION_WIN10_V6_0 VMBUS_MAKE_VERSION(6, 0)
/* Make maximum size of pipe payload of 16K */
#define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384)
@@ -335,14 +337,22 @@ struct vmbus_channel_offer {
} __packed;
/* Server Flags */
-#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 1
-#define VMBUS_CHANNEL_SERVER_SUPPORTS_TRANSFER_PAGES 2
-#define VMBUS_CHANNEL_SERVER_SUPPORTS_GPADLS 4
-#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x10
-#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x100
-#define VMBUS_CHANNEL_PARENT_OFFER 0x200
-#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x400
-#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000
+#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 0x0001
+/*
+ * This flag indicates that the channel is offered by the paravisor, and must
+ * use encrypted memory for the channel ring buffer.
+ */
+#define VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER 0x0002
+/*
+ * This flag indicates that the channel is offered by the paravisor, and must
+ * use encrypted memory for GPA direct packets and additional GPADLs.
+ */
+#define VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY 0x0004
+#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x0010
+#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x0100
+#define VMBUS_CHANNEL_PARENT_OFFER 0x0200
+#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x0400
+#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000
struct vmpacket_descriptor {
u16 type;
@@ -621,6 +631,12 @@ struct vmbus_channel_relid_released {
u32 child_relid;
} __packed;
+/*
+ * Used by the paravisor only, means that the encrypted ring buffers and
+ * the encrypted external memory are supported
+ */
+#define VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS 0x10
+
struct vmbus_channel_initiate_contact {
struct vmbus_channel_message_header header;
u32 vmbus_version_requested;
@@ -630,7 +646,8 @@ struct vmbus_channel_initiate_contact {
struct {
u8 msg_sint;
u8 msg_vtl;
- u8 reserved[6];
+ u8 reserved[2];
+ u32 feature_flags; /* VMBus version 6.0 */
};
};
u64 monitor_page1;
@@ -1003,6 +1020,10 @@ struct vmbus_channel {
/* boolean to control visibility of sysfs for ring buffer */
bool ring_sysfs_visible;
+ /* The ring buffer is encrypted */
+ bool co_ring_buffer;
+ /* The external memory is encrypted */
+ bool co_external_memory;
};
#define lock_requestor(channel, flags) \
@@ -1027,6 +1048,16 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id,
u64 rqst_addr);
u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id);
+static inline bool is_co_ring_buffer(const struct vmbus_channel_offer_channel *o)
+{
+ return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER);
+}
+
+static inline bool is_co_external_memory(const struct vmbus_channel_offer_channel *o)
+{
+ return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY);
+}
+
static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o)
{
return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER);
diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h
index 5a00b8b2cf9f..cfb6ddeb292b 100644
--- a/include/linux/static_call_types.h
+++ b/include/linux/static_call_types.h
@@ -25,6 +25,8 @@
#define STATIC_CALL_SITE_INIT 2UL /* init section */
#define STATIC_CALL_SITE_FLAGS 3UL
+#ifndef __ASSEMBLY__
+
/*
* The static call site table needs to be created by external tooling (objtool
* or a compiler plugin).
@@ -100,4 +102,6 @@ struct static_call_key {
#endif /* CONFIG_HAVE_STATIC_CALL */
+#endif /* __ASSEMBLY__ */
+
#endif /* _STATIC_CALL_TYPES_H */
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 876bfe4e4227..dee3ece28ce5 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -26,6 +26,7 @@ enum {
MSHV_PT_BIT_LAPIC,
MSHV_PT_BIT_X2APIC,
MSHV_PT_BIT_GPA_SUPER_PAGES,
+ MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES,
MSHV_PT_BIT_COUNT,
};
@@ -41,6 +42,8 @@ enum {
* @pt_flags: Bitmask of 1 << MSHV_PT_BIT_*
* @pt_isolation: MSHV_PT_ISOLATION_*
*
+ * This is the initial/v1 version for backward compatibility.
+ *
* Returns a file descriptor to act as a handle to a guest partition.
* At this point the partition is not yet initialized in the hypervisor.
* Some operations must be done with the partition in this state, e.g. setting
@@ -52,6 +55,37 @@ struct mshv_create_partition {
__u64 pt_isolation;
};
+#define MSHV_NUM_CPU_FEATURES_BANKS 2
+
+/**
+ * struct mshv_create_partition_v2
+ *
+ * This is an extended version of the initial MSHV_CREATE_PARTITION ioctl
+ * above and allows for the following additional parameters:
+ *
+ * @pt_num_cpu_fbanks: Must be set to MSHV_NUM_CPU_FEATURES_BANKS.
+ * @pt_cpu_fbanks: Disabled processor feature banks array.
+ * @pt_disabled_xsave: Disabled xsave feature bits.
+ *
+ * pt_cpu_fbanks and pt_disabled_xsave are passed through as-is to the create
+ * partition hypercall.
+ *
+ * Returns: same as the original mshv_create_partition above
+ */
+struct mshv_create_partition_v2 {
+ __u64 pt_flags;
+ __u64 pt_isolation;
+ __u16 pt_num_cpu_fbanks;
+ __u8 pt_rsvd[6]; /* MBZ */
+ __u64 pt_cpu_fbanks[MSHV_NUM_CPU_FEATURES_BANKS];
+ __u64 pt_rsvd1[2]; /* MBZ */
+#if defined(__x86_64__)
+ __u64 pt_disabled_xsave;
+#else
+ __u64 pt_rsvd2; /* MBZ */
+#endif
+} __packed;
+
/* /dev/mshv */
#define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition)
@@ -89,7 +123,7 @@ enum {
* @rsvd: MBZ
*
* Map or unmap a region of userspace memory to Guest Physical Addresses (GPA).
- * Mappings can't overlap in GPA space or userspace.
+ * Mappings can't overlap in GPA space.
* To unmap, these fields must match an existing mapping.
*/
struct mshv_user_mem_region {
@@ -288,4 +322,84 @@ struct mshv_get_set_vp_state {
* #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
*/
+/* Structure definitions, macros and IOCTLs for mshv_vtl */
+
+#define MSHV_CAP_CORE_API_STABLE 0x0
+#define MSHV_CAP_REGISTER_PAGE 0x1
+#define MSHV_CAP_VTL_RETURN_ACTION 0x2
+#define MSHV_CAP_DR6_SHARED 0x3
+#define MSHV_MAX_RUN_MSG_SIZE 256
+
+struct mshv_vp_registers {
+ __u32 count; /* supports only 1 register at a time */
+ __u32 reserved; /* Reserved for alignment or future use */
+ __u64 regs_ptr; /* pointer to struct hv_register_assoc */
+};
+
+struct mshv_vtl_set_eventfd {
+ __s32 fd;
+ __u32 flag;
+};
+
+struct mshv_vtl_signal_event {
+ __u32 connection_id;
+ __u32 flag;
+};
+
+struct mshv_vtl_sint_post_msg {
+ __u64 message_type;
+ __u32 connection_id;
+ __u32 payload_size; /* Must not exceed HV_MESSAGE_PAYLOAD_BYTE_COUNT */
+ __u64 payload_ptr; /* pointer to message payload (bytes) */
+};
+
+struct mshv_vtl_ram_disposition {
+ __u64 start_pfn;
+ __u64 last_pfn;
+};
+
+struct mshv_vtl_set_poll_file {
+ __u32 cpu;
+ __u32 fd;
+};
+
+struct mshv_vtl_hvcall_setup {
+ __u64 bitmap_array_size; /* stores number of bytes */
+ __u64 allow_bitmap_ptr;
+};
+
+struct mshv_vtl_hvcall {
+ __u64 control; /* Hypercall control code */
+ __u64 input_size; /* Size of the input data */
+ __u64 input_ptr; /* Pointer to the input struct */
+ __u64 status; /* Status of the hypercall (output) */
+ __u64 output_size; /* Size of the output data */
+ __u64 output_ptr; /* Pointer to the output struct */
+};
+
+struct mshv_sint_mask {
+ __u8 mask;
+ __u8 reserved[7];
+};
+
+/* /dev/mshv device IOCTL */
+#define MSHV_CHECK_EXTENSION _IOW(MSHV_IOCTL, 0x00, __u32)
+
+/* vtl device */
+#define MSHV_CREATE_VTL _IOR(MSHV_IOCTL, 0x1D, char)
+#define MSHV_ADD_VTL0_MEMORY _IOW(MSHV_IOCTL, 0x21, struct mshv_vtl_ram_disposition)
+#define MSHV_SET_POLL_FILE _IOW(MSHV_IOCTL, 0x25, struct mshv_vtl_set_poll_file)
+#define MSHV_RETURN_TO_LOWER_VTL _IO(MSHV_IOCTL, 0x27)
+#define MSHV_GET_VP_REGISTERS _IOWR(MSHV_IOCTL, 0x05, struct mshv_vp_registers)
+#define MSHV_SET_VP_REGISTERS _IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers)
+
+/* VMBus device IOCTLs */
+#define MSHV_SINT_SIGNAL_EVENT _IOW(MSHV_IOCTL, 0x22, struct mshv_vtl_signal_event)
+#define MSHV_SINT_POST_MESSAGE _IOW(MSHV_IOCTL, 0x23, struct mshv_vtl_sint_post_msg)
+#define MSHV_SINT_SET_EVENTFD _IOW(MSHV_IOCTL, 0x24, struct mshv_vtl_set_eventfd)
+#define MSHV_SINT_PAUSE_MESSAGE_STREAM _IOW(MSHV_IOCTL, 0x25, struct mshv_sint_mask)
+
+/* hv_hvcall device */
+#define MSHV_HVCALL_SETUP _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup)
+#define MSHV_HVCALL _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall)
#endif
diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h
index 5a00b8b2cf9f..cfb6ddeb292b 100644
--- a/tools/include/linux/static_call_types.h
+++ b/tools/include/linux/static_call_types.h
@@ -25,6 +25,8 @@
#define STATIC_CALL_SITE_INIT 2UL /* init section */
#define STATIC_CALL_SITE_FLAGS 3UL
+#ifndef __ASSEMBLY__
+
/*
* The static call site table needs to be created by external tooling (objtool
* or a compiler plugin).
@@ -100,4 +102,6 @@ struct static_call_key {
#endif /* CONFIG_HAVE_STATIC_CALL */
+#endif /* __ASSEMBLY__ */
+
#endif /* _STATIC_CALL_TYPES_H */