summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi3
-rw-r--r--arch/arm64/boot/dts/arm/morello.dtsi22
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi25
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi26
-rw-r--r--arch/arm64/boot/dts/freescale/imx95.dtsi8
-rw-r--r--arch/arm64/boot/dts/st/stm32mp211.dtsi8
-rw-r--r--arch/arm64/boot/dts/st/stm32mp231.dtsi9
-rw-r--r--arch/arm64/boot/dts/st/stm32mp251.dtsi9
-rw-r--r--arch/arm64/include/asm/el2_setup.h2
-rw-r--r--arch/arm64/include/asm/kvm_arm.h3
-rw-r--r--arch/arm64/include/asm/vdso/gettimeofday.h13
-rw-r--r--arch/arm64/kernel/cpufeature.c9
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h13
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c2
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c36
-rw-r--r--arch/arm64/kvm/mmu.c13
-rw-r--r--arch/arm64/kvm/sys_regs.c6
-rw-r--r--arch/mips/include/asm/idle.h5
-rw-r--r--arch/mips/include/asm/ptrace.h3
-rw-r--r--arch/mips/kernel/genex.S71
-rw-r--r--arch/mips/kernel/idle.c7
-rw-r--r--arch/mips/kernel/smp-cps.c4
-rw-r--r--arch/mips/kernel/traps.c10
-rw-r--r--arch/riscv/kernel/process.c6
-rw-r--r--arch/riscv/kernel/traps.c64
-rw-r--r--arch/riscv/kernel/traps_misaligned.c19
-rw-r--r--arch/riscv/kvm/vcpu.c2
-rw-r--r--arch/s390/configs/debug_defconfig28
-rw-r--r--arch/s390/configs/defconfig24
-rw-r--r--arch/s390/configs/zfcpdump_defconfig1
-rw-r--r--arch/s390/kernel/entry.S3
-rw-r--r--arch/s390/pci/pci_clp.c2
-rw-r--r--arch/um/include/asm/uaccess.h2
-rw-r--r--arch/um/kernel/trap.c26
-rw-r--r--arch/um/kernel/um_arch.c2
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/Makefile1
-rw-r--r--arch/x86/boot/bioscall.S4
-rw-r--r--arch/x86/boot/boot.h6
-rw-r--r--arch/x86/boot/compressed/Makefile10
-rw-r--r--arch/x86/boot/compressed/head_64.S1
-rw-r--r--arch/x86/boot/compressed/misc.c1
-rw-r--r--arch/x86/boot/compressed/misc.h8
-rw-r--r--arch/x86/boot/compressed/pgtable.h18
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c1
-rw-r--r--arch/x86/boot/compressed/sev-handle-vc.c134
-rw-r--r--arch/x86/boot/compressed/sev.c154
-rw-r--r--arch/x86/boot/compressed/sev.h21
-rw-r--r--arch/x86/boot/compressed/string.c8
-rw-r--r--arch/x86/boot/copy.S8
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/boot/startup/Makefile30
-rw-r--r--arch/x86/boot/startup/efi-mixed.S253
-rw-r--r--arch/x86/boot/startup/gdt_idt.c71
-rw-r--r--arch/x86/boot/startup/la57toggle.S (renamed from arch/x86/boot/compressed/la57toggle.S)1
-rw-r--r--arch/x86/boot/startup/map_kernel.c223
-rw-r--r--arch/x86/boot/startup/sev-shared.c (renamed from arch/x86/coco/sev/shared.c)894
-rw-r--r--arch/x86/boot/startup/sev-startup.c368
-rw-r--r--arch/x86/boot/startup/sme.c (renamed from arch/x86/mm/mem_encrypt_identity.c)28
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/boot/video.c2
-rw-r--r--arch/x86/coco/core.c2
-rw-r--r--arch/x86/coco/sev/Makefile23
-rw-r--r--arch/x86/coco/sev/core.c1713
-rw-r--r--arch/x86/coco/sev/sev-nmi.c108
-rw-r--r--arch/x86/coco/sev/vc-handle.c1061
-rw-r--r--arch/x86/coco/sev/vc-shared.c504
-rw-r--r--arch/x86/events/core.c9
-rw-r--r--arch/x86/include/asm/alternative.h11
-rw-r--r--arch/x86/include/asm/arch_hweight.h6
-rw-r--r--arch/x86/include/asm/asm.h5
-rw-r--r--arch/x86/include/asm/bitops.h7
-rw-r--r--arch/x86/include/asm/boot.h10
-rw-r--r--arch/x86/include/asm/coco.h2
-rw-r--r--arch/x86/include/asm/debugreg.h12
-rw-r--r--arch/x86/include/asm/inat.h6
-rw-r--r--arch/x86/include/asm/io.h6
-rw-r--r--arch/x86/include/asm/kexec.h7
-rw-r--r--arch/x86/include/asm/linkage.h10
-rw-r--r--arch/x86/include/asm/mem_encrypt.h2
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mmu.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h15
-rw-r--r--arch/x86/include/asm/nospec-branch.h2
-rw-r--r--arch/x86/include/asm/page_types.h4
-rw-r--r--arch/x86/include/asm/percpu.h20
-rw-r--r--arch/x86/include/asm/processor.h1
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/sev-internal.h105
-rw-r--r--arch/x86/include/asm/sev.h59
-rw-r--r--arch/x86/include/asm/special_insns.h21
-rw-r--r--arch/x86/include/asm/string_32.h15
-rw-r--r--arch/x86/include/asm/text-patching.h29
-rw-r--r--arch/x86/include/asm/uaccess_64.h6
-rw-r--r--arch/x86/include/asm/vdso/processor.h8
-rw-r--r--arch/x86/kernel/alternative.c439
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/callthunks.c6
-rw-r--r--arch/x86/kernel/cpu/bugs.c1181
-rw-r--r--arch/x86/kernel/cpu/common.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c58
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h1
-rw-r--r--arch/x86/kernel/early_printk.c9
-rw-r--r--arch/x86/kernel/ftrace.c18
-rw-r--r--arch/x86/kernel/head32.c4
-rw-r--r--arch/x86/kernel/head64.c277
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/head_64.S8
-rw-r--r--arch/x86/kernel/jump_label.c6
-rw-r--r--arch/x86/kernel/kprobes/core.c4
-rw-r--r--arch/x86/kernel/kprobes/opt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c48
-rw-r--r--arch/x86/kernel/module.c2
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S6
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S250
-rw-r--r--arch/x86/kernel/setup.c1
-rw-r--r--arch/x86/kernel/static_call.c2
-rw-r--r--arch/x86/kernel/traps.c6
-rw-r--r--arch/x86/kernel/vmlinux.lds.S14
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/mmu/mmu.c70
-rw-r--r--arch/x86/kvm/smm.c1
-rw-r--r--arch/x86/kvm/svm/sev.c32
-rw-r--r--arch/x86/kvm/svm/svm.c75
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/vmx/vmx.c8
-rw-r--r--arch/x86/kvm/x86.c4
-rw-r--r--arch/x86/lib/delay.c2
-rw-r--r--arch/x86/lib/insn-eval.c13
-rw-r--r--arch/x86/lib/insn.c7
-rw-r--r--arch/x86/lib/iomem.c2
-rw-r--r--arch/x86/lib/memcpy_64.S1
-rw-r--r--arch/x86/lib/memset_64.S1
-rw-r--r--arch/x86/lib/retpoline.S2
-rw-r--r--arch/x86/lib/string_32.c17
-rw-r--r--arch/x86/lib/strstr_32.c6
-rw-r--r--arch/x86/lib/usercopy_32.c18
-rw-r--r--arch/x86/lib/x86-opcode-map.txt56
-rw-r--r--arch/x86/mm/Makefile6
-rw-r--r--arch/x86/mm/init.c19
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c2
-rw-r--r--arch/x86/mm/pgtable.c1
-rw-r--r--arch/x86/mm/tlb.c104
-rw-r--r--arch/x86/net/bpf_jit_comp.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c8
-rw-r--r--arch/x86/platform/pvh/head.S3
-rw-r--r--arch/x86/power/hibernate_asm_32.S3
-rw-r--r--arch/x86/power/hibernate_asm_64.S3
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk7
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_32.h2
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_64.h2
153 files changed, 5304 insertions, 4076 deletions
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi b/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi
index f2386dcb9ff2..dda4fa91b2f2 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi
+++ b/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi
@@ -40,6 +40,9 @@
reg = <1>;
interrupt-parent = <&gpio4>;
interrupts = <16 IRQ_TYPE_LEVEL_LOW>;
+ micrel,led-mode = <1>;
+ clocks = <&clks IMX6UL_CLK_ENET_REF>;
+ clock-names = "rmii-ref";
status = "okay";
};
};
diff --git a/arch/arm64/boot/dts/arm/morello.dtsi b/arch/arm64/boot/dts/arm/morello.dtsi
index 0bab0b3ea969..5bc1c725dc86 100644
--- a/arch/arm64/boot/dts/arm/morello.dtsi
+++ b/arch/arm64/boot/dts/arm/morello.dtsi
@@ -44,7 +44,7 @@
next-level-cache = <&l2_0>;
clocks = <&scmi_dvfs 0>;
- l2_0: l2-cache-0 {
+ l2_0: l2-cache {
compatible = "cache";
cache-level = <2>;
/* 8 ways set associative */
@@ -53,13 +53,6 @@
cache-sets = <2048>;
cache-unified;
next-level-cache = <&l3_0>;
-
- l3_0: l3-cache {
- compatible = "cache";
- cache-level = <3>;
- cache-size = <0x100000>;
- cache-unified;
- };
};
};
@@ -78,7 +71,7 @@
next-level-cache = <&l2_1>;
clocks = <&scmi_dvfs 0>;
- l2_1: l2-cache-1 {
+ l2_1: l2-cache {
compatible = "cache";
cache-level = <2>;
/* 8 ways set associative */
@@ -105,7 +98,7 @@
next-level-cache = <&l2_2>;
clocks = <&scmi_dvfs 1>;
- l2_2: l2-cache-2 {
+ l2_2: l2-cache {
compatible = "cache";
cache-level = <2>;
/* 8 ways set associative */
@@ -132,7 +125,7 @@
next-level-cache = <&l2_3>;
clocks = <&scmi_dvfs 1>;
- l2_3: l2-cache-3 {
+ l2_3: l2-cache {
compatible = "cache";
cache-level = <2>;
/* 8 ways set associative */
@@ -143,6 +136,13 @@
next-level-cache = <&l3_0>;
};
};
+
+ l3_0: l3-cache {
+ compatible = "cache";
+ cache-level = <3>;
+ cache-size = <0x100000>;
+ cache-unified;
+ };
};
firmware {
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
index 7251ad3a0017..b46566f3ce20 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
@@ -144,6 +144,19 @@
startup-delay-us = <20000>;
};
+ reg_usdhc2_vqmmc: regulator-usdhc2-vqmmc {
+ compatible = "regulator-gpio";
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_usdhc2_vsel>;
+ gpios = <&gpio1 4 GPIO_ACTIVE_HIGH>;
+ regulator-max-microvolt = <3300000>;
+ regulator-min-microvolt = <1800000>;
+ states = <1800000 0x1>,
+ <3300000 0x0>;
+ regulator-name = "PMIC_USDHC_VSELECT";
+ vin-supply = <&reg_nvcc_sd>;
+ };
+
reserved-memory {
#address-cells = <2>;
#size-cells = <2>;
@@ -269,7 +282,7 @@
"SODIMM_19",
"",
"",
- "",
+ "PMIC_USDHC_VSELECT",
"",
"",
"",
@@ -785,6 +798,7 @@
pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_cd>;
pinctrl-3 = <&pinctrl_usdhc2_sleep>, <&pinctrl_usdhc2_cd_sleep>;
vmmc-supply = <&reg_usdhc2_vmmc>;
+ vqmmc-supply = <&reg_usdhc2_vqmmc>;
};
&wdog1 {
@@ -1206,13 +1220,17 @@
<MX8MM_IOMUXC_NAND_CLE_GPIO3_IO5 0x6>; /* SODIMM 76 */
};
+ pinctrl_usdhc2_vsel: usdhc2vselgrp {
+ fsl,pins =
+ <MX8MM_IOMUXC_GPIO1_IO04_GPIO1_IO4 0x10>; /* PMIC_USDHC_VSELECT */
+ };
+
/*
* Note: Due to ERR050080 we use discrete external on-module resistors pulling-up to the
* on-module +V3.3_1.8_SD (LDO5) rail and explicitly disable the internal pull-ups here.
*/
pinctrl_usdhc2: usdhc2grp {
fsl,pins =
- <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>,
<MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x90>, /* SODIMM 78 */
<MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x90>, /* SODIMM 74 */
<MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x90>, /* SODIMM 80 */
@@ -1223,7 +1241,6 @@
pinctrl_usdhc2_100mhz: usdhc2-100mhzgrp {
fsl,pins =
- <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>,
<MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x94>,
<MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x94>,
<MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x94>,
@@ -1234,7 +1251,6 @@
pinctrl_usdhc2_200mhz: usdhc2-200mhzgrp {
fsl,pins =
- <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>,
<MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x96>,
<MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x96>,
<MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x96>,
@@ -1246,7 +1262,6 @@
/* Avoid backfeeding with removed card power */
pinctrl_usdhc2_sleep: usdhc2slpgrp {
fsl,pins =
- <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x0>,
<MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x0>,
<MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x0>,
<MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x0>,
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi
index a1b75c9068b2..dc0ccd723c6d 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi
@@ -24,6 +24,20 @@
fsl,operating-mode = "nominal";
};
+&gpu2d {
+ assigned-clocks = <&clk IMX8MP_CLK_GPU2D_CORE>;
+ assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>;
+ assigned-clock-rates = <800000000>;
+};
+
+&gpu3d {
+ assigned-clocks = <&clk IMX8MP_CLK_GPU3D_CORE>,
+ <&clk IMX8MP_CLK_GPU3D_SHADER_CORE>;
+ assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>,
+ <&clk IMX8MP_SYS_PLL1_800M>;
+ assigned-clock-rates = <800000000>, <800000000>;
+};
+
&pgc_hdmimix {
assigned-clocks = <&clk IMX8MP_CLK_HDMI_AXI>,
<&clk IMX8MP_CLK_HDMI_APB>;
@@ -46,6 +60,18 @@
assigned-clock-rates = <600000000>, <300000000>;
};
+&pgc_mlmix {
+ assigned-clocks = <&clk IMX8MP_CLK_ML_CORE>,
+ <&clk IMX8MP_CLK_ML_AXI>,
+ <&clk IMX8MP_CLK_ML_AHB>;
+ assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>,
+ <&clk IMX8MP_SYS_PLL1_800M>,
+ <&clk IMX8MP_SYS_PLL1_800M>;
+ assigned-clock-rates = <800000000>,
+ <800000000>,
+ <300000000>;
+};
+
&media_blk_ctrl {
assigned-clocks = <&clk IMX8MP_CLK_MEDIA_AXI>,
<&clk IMX8MP_CLK_MEDIA_APB>,
diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi
index 9bb26b466a06..59f057ba6fa7 100644
--- a/arch/arm64/boot/dts/freescale/imx95.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx95.dtsi
@@ -1626,7 +1626,7 @@
reg = <0 0x4c300000 0 0x10000>,
<0 0x60100000 0 0xfe00000>,
<0 0x4c360000 0 0x10000>,
- <0 0x4c340000 0 0x2000>;
+ <0 0x4c340000 0 0x4000>;
reg-names = "dbi", "config", "atu", "app";
ranges = <0x81000000 0x0 0x00000000 0x0 0x6ff00000 0 0x00100000>,
<0x82000000 0x0 0x10000000 0x9 0x10000000 0 0x10000000>;
@@ -1673,7 +1673,7 @@
reg = <0 0x4c300000 0 0x10000>,
<0 0x4c360000 0 0x1000>,
<0 0x4c320000 0 0x1000>,
- <0 0x4c340000 0 0x2000>,
+ <0 0x4c340000 0 0x4000>,
<0 0x4c370000 0 0x10000>,
<0x9 0 1 0>;
reg-names = "dbi","atu", "dbi2", "app", "dma", "addr_space";
@@ -1700,7 +1700,7 @@
reg = <0 0x4c380000 0 0x10000>,
<8 0x80100000 0 0xfe00000>,
<0 0x4c3e0000 0 0x10000>,
- <0 0x4c3c0000 0 0x2000>;
+ <0 0x4c3c0000 0 0x4000>;
reg-names = "dbi", "config", "atu", "app";
ranges = <0x81000000 0 0x00000000 0x8 0x8ff00000 0 0x00100000>,
<0x82000000 0 0x10000000 0xa 0x10000000 0 0x10000000>;
@@ -1749,7 +1749,7 @@
reg = <0 0x4c380000 0 0x10000>,
<0 0x4c3e0000 0 0x1000>,
<0 0x4c3a0000 0 0x1000>,
- <0 0x4c3c0000 0 0x2000>,
+ <0 0x4c3c0000 0 0x4000>,
<0 0x4c3f0000 0 0x10000>,
<0xa 0 1 0>;
reg-names = "dbi", "atu", "dbi2", "app", "dma", "addr_space";
diff --git a/arch/arm64/boot/dts/st/stm32mp211.dtsi b/arch/arm64/boot/dts/st/stm32mp211.dtsi
index 6dd1377f3e1d..bf888d60cd4f 100644
--- a/arch/arm64/boot/dts/st/stm32mp211.dtsi
+++ b/arch/arm64/boot/dts/st/stm32mp211.dtsi
@@ -116,11 +116,11 @@
};
intc: interrupt-controller@4ac10000 {
- compatible = "arm,cortex-a7-gic";
+ compatible = "arm,gic-400";
reg = <0x4ac10000 0x0 0x1000>,
- <0x4ac20000 0x0 0x2000>,
- <0x4ac40000 0x0 0x2000>,
- <0x4ac60000 0x0 0x2000>;
+ <0x4ac20000 0x0 0x20000>,
+ <0x4ac40000 0x0 0x20000>,
+ <0x4ac60000 0x0 0x20000>;
#interrupt-cells = <3>;
interrupt-controller;
};
diff --git a/arch/arm64/boot/dts/st/stm32mp231.dtsi b/arch/arm64/boot/dts/st/stm32mp231.dtsi
index 8820d219a33e..75697acd1345 100644
--- a/arch/arm64/boot/dts/st/stm32mp231.dtsi
+++ b/arch/arm64/boot/dts/st/stm32mp231.dtsi
@@ -1201,13 +1201,12 @@
};
intc: interrupt-controller@4ac10000 {
- compatible = "arm,cortex-a7-gic";
+ compatible = "arm,gic-400";
reg = <0x4ac10000 0x1000>,
- <0x4ac20000 0x2000>,
- <0x4ac40000 0x2000>,
- <0x4ac60000 0x2000>;
+ <0x4ac20000 0x20000>,
+ <0x4ac40000 0x20000>,
+ <0x4ac60000 0x20000>;
#interrupt-cells = <3>;
- #address-cells = <1>;
interrupt-controller;
};
};
diff --git a/arch/arm64/boot/dts/st/stm32mp251.dtsi b/arch/arm64/boot/dts/st/stm32mp251.dtsi
index f3c6cdfd7008..87110f91e489 100644
--- a/arch/arm64/boot/dts/st/stm32mp251.dtsi
+++ b/arch/arm64/boot/dts/st/stm32mp251.dtsi
@@ -115,14 +115,13 @@
};
intc: interrupt-controller@4ac00000 {
- compatible = "arm,cortex-a7-gic";
+ compatible = "arm,gic-400";
#interrupt-cells = <3>;
- #address-cells = <1>;
interrupt-controller;
reg = <0x0 0x4ac10000 0x0 0x1000>,
- <0x0 0x4ac20000 0x0 0x2000>,
- <0x0 0x4ac40000 0x0 0x2000>,
- <0x0 0x4ac60000 0x0 0x2000>;
+ <0x0 0x4ac20000 0x0 0x20000>,
+ <0x0 0x4ac40000 0x0 0x20000>,
+ <0x0 0x4ac60000 0x0 0x20000>;
};
psci {
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index ebceaae3c749..d40e427ddad9 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -52,7 +52,7 @@
mrs x0, id_aa64mmfr1_el1
ubfx x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4
cbz x0, .Lskip_hcrx_\@
- mov_q x0, HCRX_HOST_FLAGS
+ mov_q x0, (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM)
/* Enable GCS if supported */
mrs_s x1, SYS_ID_AA64PFR1_EL1
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 974d72b5905b..e9c8a581e16f 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -100,9 +100,8 @@
HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID1)
#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
-#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
+#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H | HCR_AMO | HCR_IMO | HCR_FMO)
-#define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM)
#define MPAMHCR_HOST_FLAGS 0
/* TCR_EL2 Registers bits */
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h
index 92a2b59a9f3d..3322c7047d84 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -99,6 +99,19 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
return res;
}
+#if IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB)
+static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void)
+{
+ const struct vdso_time_data *ret = &vdso_u_time_data;
+
+ /* Work around invalid absolute relocations */
+ OPTIMIZER_HIDE_VAR(ret);
+
+ return ret;
+}
+#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data
+#endif /* IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) */
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9c4d6d552b25..4c46d80aa64b 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -114,7 +114,14 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC
DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS);
-bool arm64_use_ng_mappings = false;
+/*
+ * arm64_use_ng_mappings must be placed in the .data section, otherwise it
+ * ends up in the .bss section where it is initialized in early_map_kernel()
+ * after the MMU (with the idmap) was enabled. create_init_idmap() - which
+ * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG -
+ * may end up generating an incorrect idmap page table attributes.
+ */
+bool arm64_use_ng_mappings __read_mostly = false;
EXPORT_SYMBOL(arm64_use_ng_mappings);
DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors;
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index b741ea6aefa5..96f625dc7256 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -235,6 +235,8 @@ static inline void __deactivate_traps_mpam(void)
static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
{
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
/* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
write_sysreg(1 << 15, hstr_el2);
@@ -245,11 +247,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
* EL1 instead of being trapped to EL2.
*/
if (system_supports_pmuv3()) {
- struct kvm_cpu_context *hctxt;
-
write_sysreg(0, pmselr_el0);
- hctxt = host_data_ptr(host_ctxt);
ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0);
write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
vcpu_set_flag(vcpu, PMUSERENR_ON_CPU);
@@ -269,6 +268,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
hcrx &= ~clr;
}
+ ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2);
write_sysreg_s(hcrx, SYS_HCRX_EL2);
}
@@ -278,19 +278,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
{
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
write_sysreg(0, hstr_el2);
if (system_supports_pmuv3()) {
- struct kvm_cpu_context *hctxt;
-
- hctxt = host_data_ptr(host_ctxt);
write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0);
vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
}
if (cpus_have_final_cap(ARM64_HAS_HCX))
- write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2);
+ write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2);
__deactivate_traps_hfgxtr(vcpu);
__deactivate_traps_mpam();
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 2a5284f749b4..e80f3ebd3e2a 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -503,7 +503,7 @@ int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
int ret;
- if (!addr_is_memory(addr))
+ if (!range_is_memory(addr, addr + size))
return -EPERM;
ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index ed363aa3027e..50aa8dbcae75 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -429,23 +429,27 @@ u64 __vgic_v3_get_gic_config(void)
/*
* To check whether we have a MMIO-based (GICv2 compatible)
* CPU interface, we need to disable the system register
- * view. To do that safely, we have to prevent any interrupt
- * from firing (which would be deadly).
+ * view.
*
- * Note that this only makes sense on VHE, as interrupts are
- * already masked for nVHE as part of the exception entry to
- * EL2.
- */
- if (has_vhe())
- flags = local_daif_save();
-
- /*
* Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates
* that to be able to set ICC_SRE_EL1.SRE to 0, all the
* interrupt overrides must be set. You've got to love this.
+ *
+ * As we always run VHE with HCR_xMO set, no extra xMO
+ * manipulation is required in that case.
+ *
+ * To safely disable SRE, we have to prevent any interrupt
+ * from firing (which would be deadly). This only makes sense
+ * on VHE, as interrupts are already masked for nVHE as part
+ * of the exception entry to EL2.
*/
- sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO);
- isb();
+ if (has_vhe()) {
+ flags = local_daif_save();
+ } else {
+ sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO);
+ isb();
+ }
+
write_gicreg(0, ICC_SRE_EL1);
isb();
@@ -453,11 +457,13 @@ u64 __vgic_v3_get_gic_config(void)
write_gicreg(sre, ICC_SRE_EL1);
isb();
- sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0);
- isb();
- if (has_vhe())
+ if (has_vhe()) {
local_daif_restore(flags);
+ } else {
+ sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0);
+ isb();
+ }
val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63);
val |= read_gicreg(ICH_VTR_EL2);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 754f2fe0cc67..eeda92330ade 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1501,6 +1501,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
+ if (!is_protected_kvm_enabled())
+ memcache = &vcpu->arch.mmu_page_cache;
+ else
+ memcache = &vcpu->arch.pkvm_memcache;
+
/*
* Permission faults just need to update the existing leaf entry,
* and so normally don't require allocations from the memcache. The
@@ -1510,13 +1515,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (!fault_is_perm || (logging_active && write_fault)) {
int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
- if (!is_protected_kvm_enabled()) {
- memcache = &vcpu->arch.mmu_page_cache;
+ if (!is_protected_kvm_enabled())
ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
- } else {
- memcache = &vcpu->arch.pkvm_memcache;
+ else
ret = topup_hyp_memcache(memcache, min_pages);
- }
+
if (ret)
return ret;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 005ad28f7306..5dde9285afc8 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1945,6 +1945,12 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
if ((hw_val & mpam_mask) == (user_val & mpam_mask))
user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
+ /* Fail the guest's request to disable the AA64 ISA at EL{0,1,2} */
+ if (!FIELD_GET(ID_AA64PFR0_EL1_EL0, user_val) ||
+ !FIELD_GET(ID_AA64PFR0_EL1_EL1, user_val) ||
+ (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val)))
+ return -EINVAL;
+
return set_id_reg(vcpu, rd, user_val);
}
diff --git a/arch/mips/include/asm/idle.h b/arch/mips/include/asm/idle.h
index 0992cad9c632..c7d75807d13f 100644
--- a/arch/mips/include/asm/idle.h
+++ b/arch/mips/include/asm/idle.h
@@ -6,11 +6,10 @@
#include <linux/linkage.h>
extern void (*cpu_wait)(void);
-extern void r4k_wait(void);
-extern asmlinkage void __r4k_wait(void);
+extern asmlinkage void r4k_wait(void);
extern void r4k_wait_irqoff(void);
-static inline int using_rollback_handler(void)
+static inline int using_skipover_handler(void)
{
return cpu_wait == r4k_wait;
}
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index 85fa9962266a..ef72c46b5568 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -65,7 +65,8 @@ static inline void instruction_pointer_set(struct pt_regs *regs,
/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
-#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last))
+#define MAX_REG_OFFSET \
+ (offsetof(struct pt_regs, __last) - sizeof(unsigned long))
/**
* regs_get_register() - get register value from its offset
diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S
index a572ce36a24f..08c0a01d9a29 100644
--- a/arch/mips/kernel/genex.S
+++ b/arch/mips/kernel/genex.S
@@ -104,48 +104,59 @@ handle_vcei:
__FINIT
- .align 5 /* 32 byte rollback region */
-LEAF(__r4k_wait)
- .set push
- .set noreorder
- /* start of rollback region */
- LONG_L t0, TI_FLAGS($28)
- nop
- andi t0, _TIF_NEED_RESCHED
- bnez t0, 1f
- nop
- nop
- nop
-#ifdef CONFIG_CPU_MICROMIPS
- nop
- nop
- nop
- nop
-#endif
+ .section .cpuidle.text,"ax"
+ /* Align to 32 bytes for the maximum idle interrupt region size. */
+ .align 5
+LEAF(r4k_wait)
+ /* Keep the ISA bit clear for calculations on local labels here. */
+0: .fill 0
+ /* Start of idle interrupt region. */
+ local_irq_enable
+ /*
+ * If an interrupt lands here, before going idle on the next
+ * instruction, we must *NOT* go idle since the interrupt could
+ * have set TIF_NEED_RESCHED or caused a timer to need resched.
+ * Fall through -- see skipover_handler below -- and have the
+ * idle loop take care of things.
+ */
+1: .fill 0
+ /* The R2 EI/EHB sequence takes 8 bytes, otherwise pad up. */
+ .if 1b - 0b > 32
+ .error "overlong idle interrupt region"
+ .elseif 1b - 0b > 8
+ .align 4
+ .endif
+2: .fill 0
+ .equ r4k_wait_idle_size, 2b - 0b
+ /* End of idle interrupt region; size has to be a power of 2. */
.set MIPS_ISA_ARCH_LEVEL_RAW
+r4k_wait_insn:
wait
- /* end of rollback region (the region size must be power of two) */
-1:
+r4k_wait_exit:
+ .set mips0
+ local_irq_disable
jr ra
- nop
- .set pop
- END(__r4k_wait)
+ END(r4k_wait)
+ .previous
- .macro BUILD_ROLLBACK_PROLOGUE handler
- FEXPORT(rollback_\handler)
+ .macro BUILD_SKIPOVER_PROLOGUE handler
+ FEXPORT(skipover_\handler)
.set push
.set noat
MFC0 k0, CP0_EPC
- PTR_LA k1, __r4k_wait
- ori k0, 0x1f /* 32 byte rollback region */
- xori k0, 0x1f
+ /* Subtract/add 2 to let the ISA bit propagate through the mask. */
+ PTR_LA k1, r4k_wait_insn - 2
+ ori k0, r4k_wait_idle_size - 2
+ .set noreorder
bne k0, k1, \handler
+ PTR_ADDIU k0, r4k_wait_exit - r4k_wait_insn + 2
+ .set reorder
MTC0 k0, CP0_EPC
.set pop
.endm
.align 5
-BUILD_ROLLBACK_PROLOGUE handle_int
+BUILD_SKIPOVER_PROLOGUE handle_int
NESTED(handle_int, PT_SIZE, sp)
.cfi_signal_frame
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -265,7 +276,7 @@ NESTED(except_vec_ejtag_debug, 0, sp)
* This prototype is copied to ebase + n*IntCtl.VS and patched
* to invoke the handler
*/
-BUILD_ROLLBACK_PROLOGUE except_vec_vi
+BUILD_SKIPOVER_PROLOGUE except_vec_vi
NESTED(except_vec_vi, 0, sp)
SAVE_SOME docfi=1
SAVE_AT docfi=1
diff --git a/arch/mips/kernel/idle.c b/arch/mips/kernel/idle.c
index 5abc8b7340f8..80e8a04a642e 100644
--- a/arch/mips/kernel/idle.c
+++ b/arch/mips/kernel/idle.c
@@ -35,13 +35,6 @@ static void __cpuidle r3081_wait(void)
write_c0_conf(cfg | R30XX_CONF_HALT);
}
-void __cpuidle r4k_wait(void)
-{
- raw_local_irq_enable();
- __r4k_wait();
- raw_local_irq_disable();
-}
-
/*
* This variant is preferable as it allows testing need_resched and going to
* sleep depending on the outcome atomically. Unfortunately the "It is
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index e85bd087467e..cc26d56f3ab6 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -332,6 +332,8 @@ static void __init cps_prepare_cpus(unsigned int max_cpus)
mips_cps_cluster_bootcfg = kcalloc(nclusters,
sizeof(*mips_cps_cluster_bootcfg),
GFP_KERNEL);
+ if (!mips_cps_cluster_bootcfg)
+ goto err_out;
if (nclusters > 1)
mips_cm_update_property();
@@ -348,6 +350,8 @@ static void __init cps_prepare_cpus(unsigned int max_cpus)
mips_cps_cluster_bootcfg[cl].core_power =
kcalloc(BITS_TO_LONGS(ncores), sizeof(unsigned long),
GFP_KERNEL);
+ if (!mips_cps_cluster_bootcfg[cl].core_power)
+ goto err_out;
/* Allocate VPE boot configuration structs */
for (c = 0; c < ncores; c++) {
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 39e248d0ed59..8ec1e185b35c 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -77,7 +77,7 @@
#include "access-helper.h"
extern void check_wait(void);
-extern asmlinkage void rollback_handle_int(void);
+extern asmlinkage void skipover_handle_int(void);
extern asmlinkage void handle_int(void);
extern asmlinkage void handle_adel(void);
extern asmlinkage void handle_ades(void);
@@ -2066,7 +2066,7 @@ void *set_vi_handler(int n, vi_handler_t addr)
{
extern const u8 except_vec_vi[];
extern const u8 except_vec_vi_ori[], except_vec_vi_end[];
- extern const u8 rollback_except_vec_vi[];
+ extern const u8 skipover_except_vec_vi[];
unsigned long handler;
unsigned long old_handler = vi_handlers[n];
int srssets = current_cpu_data.srsets;
@@ -2095,7 +2095,7 @@ void *set_vi_handler(int n, vi_handler_t addr)
change_c0_srsmap(0xf << n*4, 0 << n*4);
}
- vec_start = using_rollback_handler() ? rollback_except_vec_vi :
+ vec_start = using_skipover_handler() ? skipover_except_vec_vi :
except_vec_vi;
#if defined(CONFIG_CPU_MICROMIPS) || defined(CONFIG_CPU_BIG_ENDIAN)
ori_offset = except_vec_vi_ori - vec_start + 2;
@@ -2426,8 +2426,8 @@ void __init trap_init(void)
if (board_be_init)
board_be_init();
- set_except_vector(EXCCODE_INT, using_rollback_handler() ?
- rollback_handle_int : handle_int);
+ set_except_vector(EXCCODE_INT, using_skipover_handler() ?
+ skipover_handle_int : handle_int);
set_except_vector(EXCCODE_MOD, handle_tlbm);
set_except_vector(EXCCODE_TLBL, handle_tlbl);
set_except_vector(EXCCODE_TLBS, handle_tlbs);
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 7c244de77180..15d8f75902f8 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -275,6 +275,9 @@ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg)
unsigned long pmm;
u8 pmlen;
+ if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM))
+ return -EINVAL;
+
if (is_compat_thread(ti))
return -EINVAL;
@@ -330,6 +333,9 @@ long get_tagged_addr_ctrl(struct task_struct *task)
struct thread_info *ti = task_thread_info(task);
long ret = 0;
+ if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM))
+ return -EINVAL;
+
if (is_compat_thread(ti))
return -EINVAL;
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 8ff8e8b36524..9c83848797a7 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -198,47 +198,57 @@ asmlinkage __visible __trap_section void do_trap_insn_illegal(struct pt_regs *re
DO_ERROR_INFO(do_trap_load_fault,
SIGSEGV, SEGV_ACCERR, "load access fault");
-asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs)
+enum misaligned_access_type {
+ MISALIGNED_STORE,
+ MISALIGNED_LOAD,
+};
+static const struct {
+ const char *type_str;
+ int (*handler)(struct pt_regs *regs);
+} misaligned_handler[] = {
+ [MISALIGNED_STORE] = {
+ .type_str = "Oops - store (or AMO) address misaligned",
+ .handler = handle_misaligned_store,
+ },
+ [MISALIGNED_LOAD] = {
+ .type_str = "Oops - load address misaligned",
+ .handler = handle_misaligned_load,
+ },
+};
+
+static void do_trap_misaligned(struct pt_regs *regs, enum misaligned_access_type type)
{
+ irqentry_state_t state;
+
if (user_mode(regs)) {
irqentry_enter_from_user_mode(regs);
+ local_irq_enable();
+ } else {
+ state = irqentry_nmi_enter(regs);
+ }
- if (handle_misaligned_load(regs))
- do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
- "Oops - load address misaligned");
+ if (misaligned_handler[type].handler(regs))
+ do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
+ misaligned_handler[type].type_str);
+ if (user_mode(regs)) {
+ local_irq_disable();
irqentry_exit_to_user_mode(regs);
} else {
- irqentry_state_t state = irqentry_nmi_enter(regs);
-
- if (handle_misaligned_load(regs))
- do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
- "Oops - load address misaligned");
-
irqentry_nmi_exit(regs, state);
}
}
-asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs)
+asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs)
{
- if (user_mode(regs)) {
- irqentry_enter_from_user_mode(regs);
-
- if (handle_misaligned_store(regs))
- do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
- "Oops - store (or AMO) address misaligned");
-
- irqentry_exit_to_user_mode(regs);
- } else {
- irqentry_state_t state = irqentry_nmi_enter(regs);
-
- if (handle_misaligned_store(regs))
- do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
- "Oops - store (or AMO) address misaligned");
+ do_trap_misaligned(regs, MISALIGNED_LOAD);
+}
- irqentry_nmi_exit(regs, state);
- }
+asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs)
+{
+ do_trap_misaligned(regs, MISALIGNED_STORE);
}
+
DO_ERROR_INFO(do_trap_store_fault,
SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault");
DO_ERROR_INFO(do_trap_ecall_s,
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 4354c87c0376..77c788660223 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -88,6 +88,13 @@
#define INSN_MATCH_C_FSWSP 0xe002
#define INSN_MASK_C_FSWSP 0xe003
+#define INSN_MATCH_C_LHU 0x8400
+#define INSN_MASK_C_LHU 0xfc43
+#define INSN_MATCH_C_LH 0x8440
+#define INSN_MASK_C_LH 0xfc43
+#define INSN_MATCH_C_SH 0x8c00
+#define INSN_MASK_C_SH 0xfc43
+
#define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4)
#if defined(CONFIG_64BIT)
@@ -268,7 +275,7 @@ static unsigned long get_f32_rs(unsigned long insn, u8 fp_reg_offset,
int __ret; \
\
if (user_mode(regs)) { \
- __ret = __get_user(insn, (type __user *) insn_addr); \
+ __ret = get_user(insn, (type __user *) insn_addr); \
} else { \
insn = *(type *)insn_addr; \
__ret = 0; \
@@ -431,6 +438,13 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs)
fp = 1;
len = 4;
#endif
+ } else if ((insn & INSN_MASK_C_LHU) == INSN_MATCH_C_LHU) {
+ len = 2;
+ insn = RVC_RS2S(insn) << SH_RD;
+ } else if ((insn & INSN_MASK_C_LH) == INSN_MATCH_C_LH) {
+ len = 2;
+ shift = 8 * (sizeof(ulong) - len);
+ insn = RVC_RS2S(insn) << SH_RD;
} else {
regs->epc = epc;
return -1;
@@ -530,6 +544,9 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs)
len = 4;
val.data_ulong = GET_F32_RS2C(insn, regs);
#endif
+ } else if ((insn & INSN_MASK_C_SH) == INSN_MATCH_C_SH) {
+ len = 2;
+ val.data_ulong = GET_RS2S(insn, regs);
} else {
regs->epc = epc;
return -1;
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 60d684c76c58..02635bac91f1 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -77,6 +77,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
memcpy(cntx, reset_cntx, sizeof(*cntx));
spin_unlock(&vcpu->arch.reset_cntx_lock);
+ memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr));
+
kvm_riscv_vcpu_fp_reset(vcpu);
kvm_riscv_vcpu_vector_reset(vcpu);
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 6f2c9ce1b154..24b22f6a9e99 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -38,7 +38,6 @@ CONFIG_USER_NS=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
CONFIG_EXPERT=y
-# CONFIG_SYSFS_SYSCALL is not set
CONFIG_PROFILING=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
@@ -92,7 +91,6 @@ CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
CONFIG_BINFMT_MISC=m
CONFIG_ZSWAP=y
-CONFIG_ZSMALLOC=y
CONFIG_ZSMALLOC_STAT=y
CONFIG_SLAB_BUCKETS=y
CONFIG_SLUB_STATS=y
@@ -395,6 +393,9 @@ CONFIG_CLS_U32_MARK=y
CONFIG_NET_CLS_FLOW=m
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
+CONFIG_NET_EMATCH=y
CONFIG_NET_CLS_ACT=y
CONFIG_NET_ACT_POLICE=m
CONFIG_NET_ACT_GACT=m
@@ -405,6 +406,9 @@ CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_VLAN=m
+CONFIG_NET_ACT_TUNNEL_KEY=m
+CONFIG_NET_ACT_CT=m
CONFIG_NET_ACT_GATE=m
CONFIG_NET_TC_SKB_EXT=y
CONFIG_DNS_RESOLVER=y
@@ -628,8 +632,16 @@ CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_MEM=m
CONFIG_VIRTIO_INPUT=y
+CONFIG_VDPA=m
+CONFIG_VDPA_SIM=m
+CONFIG_VDPA_SIM_NET=m
+CONFIG_VDPA_SIM_BLOCK=m
+CONFIG_VDPA_USER=m
+CONFIG_MLX5_VDPA_NET=m
+CONFIG_VP_VDPA=m
CONFIG_VHOST_NET=m
CONFIG_VHOST_VSOCK=m
+CONFIG_VHOST_VDPA=m
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
@@ -654,7 +666,6 @@ CONFIG_NILFS2_FS=m
CONFIG_BCACHEFS_FS=y
CONFIG_BCACHEFS_QUOTA=y
CONFIG_BCACHEFS_POSIX_ACL=y
-CONFIG_FS_DAX=y
CONFIG_EXPORTFS_BLOCK_OPS=y
CONFIG_FS_ENCRYPTION=y
CONFIG_FS_VERITY=y
@@ -724,11 +735,10 @@ CONFIG_NLS_UTF8=m
CONFIG_DLM=m
CONFIG_UNICODE=y
CONFIG_PERSISTENT_KEYRINGS=y
+CONFIG_BIG_KEYS=y
CONFIG_ENCRYPTED_KEYS=m
CONFIG_KEY_NOTIFICATIONS=y
CONFIG_SECURITY=y
-CONFIG_HARDENED_USERCOPY=y
-CONFIG_FORTIFY_SOURCE=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_SECURITY_LOCKDOWN_LSM=y
@@ -741,6 +751,8 @@ CONFIG_IMA=y
CONFIG_IMA_DEFAULT_HASH_SHA256=y
CONFIG_IMA_WRITE_POLICY=y
CONFIG_IMA_APPRAISE=y
+CONFIG_FORTIFY_SOURCE=y
+CONFIG_HARDENED_USERCOPY=y
CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_CRYPTO_USER=m
# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
@@ -756,7 +768,6 @@ CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_ARIA=m
CONFIG_CRYPTO_BLOWFISH=m
-CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
CONFIG_CRYPTO_DES=m
@@ -801,7 +812,6 @@ CONFIG_CRYPTO_SHA3_512_S390=m
CONFIG_CRYPTO_GHASH_S390=m
CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_CHACHA_S390=m
CONFIG_CRYPTO_HMAC_S390=m
CONFIG_ZCRYPT=m
CONFIG_PKEY=m
@@ -812,9 +822,9 @@ CONFIG_PKEY_UV=m
CONFIG_CRYPTO_PAES_S390=m
CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_SYSTEM_BLACKLIST_KEYRING=y
+CONFIG_CRYPTO_KRB5=m
+CONFIG_CRYPTO_KRB5_SELFTESTS=y
CONFIG_CORDIC=m
-CONFIG_CRYPTO_LIB_CURVE25519=m
-CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_RANDOM32_SELFTEST=y
CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index f18a7d97ac21..2b8b42d569bc 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -36,7 +36,6 @@ CONFIG_USER_NS=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
CONFIG_EXPERT=y
-# CONFIG_SYSFS_SYSCALL is not set
CONFIG_PROFILING=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
@@ -86,7 +85,6 @@ CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
CONFIG_BINFMT_MISC=m
CONFIG_ZSWAP=y
-CONFIG_ZSMALLOC=y
CONFIG_ZSMALLOC_STAT=y
CONFIG_SLAB_BUCKETS=y
# CONFIG_COMPAT_BRK is not set
@@ -385,6 +383,9 @@ CONFIG_CLS_U32_MARK=y
CONFIG_NET_CLS_FLOW=m
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
+CONFIG_NET_EMATCH=y
CONFIG_NET_CLS_ACT=y
CONFIG_NET_ACT_POLICE=m
CONFIG_NET_ACT_GACT=m
@@ -395,6 +396,9 @@ CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_VLAN=m
+CONFIG_NET_ACT_TUNNEL_KEY=m
+CONFIG_NET_ACT_CT=m
CONFIG_NET_ACT_GATE=m
CONFIG_NET_TC_SKB_EXT=y
CONFIG_DNS_RESOLVER=y
@@ -618,8 +622,16 @@ CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_MEM=m
CONFIG_VIRTIO_INPUT=y
+CONFIG_VDPA=m
+CONFIG_VDPA_SIM=m
+CONFIG_VDPA_SIM_NET=m
+CONFIG_VDPA_SIM_BLOCK=m
+CONFIG_VDPA_USER=m
+CONFIG_MLX5_VDPA_NET=m
+CONFIG_VP_VDPA=m
CONFIG_VHOST_NET=m
CONFIG_VHOST_VSOCK=m
+CONFIG_VHOST_VDPA=m
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
@@ -641,7 +653,6 @@ CONFIG_NILFS2_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_BCACHEFS_QUOTA=y
CONFIG_BCACHEFS_POSIX_ACL=y
-CONFIG_FS_DAX=y
CONFIG_EXPORTFS_BLOCK_OPS=y
CONFIG_FS_ENCRYPTION=y
CONFIG_FS_VERITY=y
@@ -711,6 +722,7 @@ CONFIG_NLS_UTF8=m
CONFIG_DLM=m
CONFIG_UNICODE=y
CONFIG_PERSISTENT_KEYRINGS=y
+CONFIG_BIG_KEYS=y
CONFIG_ENCRYPTED_KEYS=m
CONFIG_KEY_NOTIFICATIONS=y
CONFIG_SECURITY=y
@@ -742,7 +754,6 @@ CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_ARIA=m
CONFIG_CRYPTO_BLOWFISH=m
-CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
CONFIG_CRYPTO_DES=m
@@ -788,7 +799,6 @@ CONFIG_CRYPTO_SHA3_512_S390=m
CONFIG_CRYPTO_GHASH_S390=m
CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_CHACHA_S390=m
CONFIG_CRYPTO_HMAC_S390=m
CONFIG_ZCRYPT=m
CONFIG_PKEY=m
@@ -799,10 +809,10 @@ CONFIG_PKEY_UV=m
CONFIG_CRYPTO_PAES_S390=m
CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_SYSTEM_BLACKLIST_KEYRING=y
+CONFIG_CRYPTO_KRB5=m
+CONFIG_CRYPTO_KRB5_SELFTESTS=y
CONFIG_CORDIC=m
CONFIG_PRIME_NUMBERS=m
-CONFIG_CRYPTO_LIB_CURVE25519=m
-CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=0
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index 853b2326a171..8163c1702720 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -70,7 +70,6 @@ CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_INFO_DWARF4=y
CONFIG_DEBUG_FS=y
CONFIG_PANIC_ON_OOPS=y
-# CONFIG_SCHED_DEBUG is not set
CONFIG_RCU_CPU_STALL_TIMEOUT=60
# CONFIG_RCU_TRACE is not set
# CONFIG_FTRACE is not set
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index dd291c9ad6a6..9980c17ba22d 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -602,7 +602,8 @@ SYM_CODE_START(stack_invalid)
stmg %r0,%r7,__PT_R0(%r11)
stmg %r8,%r9,__PT_PSW(%r11)
mvc __PT_R8(64,%r11),0(%r14)
- stg %r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2
+ GET_LC %r2
+ mvc __PT_ORIG_GPR2(8,%r11),__LC_PGM_LAST_BREAK(%r2)
xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lgr %r2,%r11 # pass pointer to pt_regs
jg kernel_stack_invalid
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 9a929bbcc397..241f7251c873 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -428,6 +428,8 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data)
return;
}
zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state);
+ if (IS_ERR(zdev))
+ return;
list_add_tail(&zdev->entry, scan_list);
}
diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h
index 3a08f9029a3f..1c6e0ae41b0c 100644
--- a/arch/um/include/asm/uaccess.h
+++ b/arch/um/include/asm/uaccess.h
@@ -55,6 +55,7 @@ do { \
goto err_label; \
} \
*((type *)dst) = get_unaligned((type *)(src)); \
+ barrier(); \
current->thread.segv_continue = NULL; \
} while (0)
@@ -66,6 +67,7 @@ do { \
if (__faulted) \
goto err_label; \
put_unaligned(*((type *)src), (type *)(dst)); \
+ barrier(); \
current->thread.segv_continue = NULL; \
} while (0)
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ce073150dc20..ef2272e92a43 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -225,20 +225,20 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
panic("Failed to sync kernel TLBs: %d", err);
goto out;
}
- else if (current->mm == NULL) {
- if (current->pagefault_disabled) {
- if (!mc) {
- show_regs(container_of(regs, struct pt_regs, regs));
- panic("Segfault with pagefaults disabled but no mcontext");
- }
- if (!current->thread.segv_continue) {
- show_regs(container_of(regs, struct pt_regs, regs));
- panic("Segfault without recovery target");
- }
- mc_set_rip(mc, current->thread.segv_continue);
- current->thread.segv_continue = NULL;
- goto out;
+ else if (current->pagefault_disabled) {
+ if (!mc) {
+ show_regs(container_of(regs, struct pt_regs, regs));
+ panic("Segfault with pagefaults disabled but no mcontext");
}
+ if (!current->thread.segv_continue) {
+ show_regs(container_of(regs, struct pt_regs, regs));
+ panic("Segfault without recovery target");
+ }
+ mc_set_rip(mc, current->thread.segv_continue);
+ current->thread.segv_continue = NULL;
+ goto out;
+ }
+ else if (current->mm == NULL) {
show_regs(container_of(regs, struct pt_regs, regs));
panic("Segfault with no mm");
}
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index d4b3b6742ec8..2f5ee045bc7a 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -477,7 +477,7 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
return text_poke(addr, opcode, len);
}
-void text_poke_sync(void)
+void smp_text_poke_sync_each_cpu(void)
{
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b9f378e05f6..297e33cfa760 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
+ select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_WATCHDOG
@@ -2368,6 +2369,7 @@ config STRICT_SIGALTSTACK_SIZE
config CFI_AUTO_DEFAULT
bool "Attempt to use FineIBT by default at boot time"
depends on FINEIBT
+ depends on !RUST || RUSTC_VERSION >= 108800
default y
help
Attempt to use FineIBT by default at boot time. If enabled,
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 594723005d95..53daf4654f6c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -281,6 +281,7 @@ archprepare: $(cpufeaturemasks.hdr)
###
# Kernel objects
+core-y += arch/x86/boot/startup/
libs-y += arch/x86/lib/
# drivers-y are linked after core-y
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index aa9b96457584..cf4a6155714e 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -32,7 +32,7 @@ intcall:
movw %dx, %si
movw %sp, %di
movw $11, %cx
- rep; movsl
+ rep movsl
/* Pop full state from the stack */
popal
@@ -67,7 +67,7 @@ intcall:
jz 4f
movw %sp, %si
movw $11, %cx
- rep; movsl
+ rep movsl
4: addw $44, %sp
/* Restore state and return */
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 38f17a1e1e36..60580836daf7 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -34,7 +34,7 @@
extern struct setup_header hdr;
extern struct boot_params boot_params;
-#define cpu_relax() asm volatile("rep; nop")
+#define cpu_relax() asm volatile("pause")
static inline void io_delay(void)
{
@@ -155,14 +155,14 @@ static inline void wrgs32(u32 v, addr_t addr)
static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len)
{
bool diff;
- asm volatile("fs; repe; cmpsb" CC_SET(nz)
+ asm volatile("fs repe cmpsb" CC_SET(nz)
: CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len)
{
bool diff;
- asm volatile("gs; repe; cmpsb" CC_SET(nz)
+ asm volatile("gs repe cmpsb" CC_SET(nz)
: CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fdbce022db55..f4f7b22d8113 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -44,10 +44,10 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
-# sev.c indirectly includes inat-table.h which is generated during
+# sev-decode-insn.c indirectly includes inat-table.c which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
# that the compiler finds it even with out-of-tree builds (make O=/some/path).
-CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/
+CFLAGS_sev-handle-vc.o += -I$(objtree)/arch/x86/lib/
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
@@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
quiet_cmd_voffset = VOFFSET $@
cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
@@ -96,8 +96,7 @@ ifdef CONFIG_X86_64
vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
- vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
- vmlinux-objs-y += $(obj)/la57toggle.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o $(obj)/sev-handle-vc.o
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
@@ -106,6 +105,7 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a
$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index eafd4f185e77..d9dab940ff62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -35,7 +35,6 @@
#include <asm/bootparam.h>
#include <asm/desc_defs.h>
#include <asm/trapnr.h>
-#include "pgtable.h"
/*
* Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 1cdcd4aaf395..94b5991da001 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -14,7 +14,6 @@
#include "misc.h"
#include "error.h"
-#include "pgtable.h"
#include "../string.h"
#include "../voffset.h"
#include <asm/bootparam_utils.h>
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index dd8d1a85f671..db1048621ea2 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -136,6 +136,9 @@ static inline void console_init(void)
#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
+struct es_em_ctxt;
+struct insn;
+
void sev_enable(struct boot_params *bp);
void snp_check_features(void);
void sev_es_shutdown_ghcb(void);
@@ -143,6 +146,11 @@ extern bool sev_es_check_ghcb_fault(unsigned long address);
void snp_set_page_private(unsigned long paddr);
void snp_set_page_shared(unsigned long paddr);
void sev_prep_identity_maps(unsigned long top_level_pgt);
+
+enum es_result vc_decode_insn(struct es_em_ctxt *ctxt);
+bool insn_has_rep_prefix(struct insn *insn);
+void sev_insn_decode_init(void);
+bool early_setup_ghcb(void);
#else
static inline void sev_enable(struct boot_params *bp)
{
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
deleted file mode 100644
index 6d595abe06b3..000000000000
--- a/arch/x86/boot/compressed/pgtable.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef BOOT_COMPRESSED_PAGETABLE_H
-#define BOOT_COMPRESSED_PAGETABLE_H
-
-#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
-
-#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0
-
-#ifndef __ASSEMBLER__
-
-extern unsigned long *trampoline_32bit;
-
-extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl);
-
-extern const u16 trampoline_ljmp_imm_offset;
-
-#endif /* __ASSEMBLER__ */
-#endif /* BOOT_COMPRESSED_PAGETABLE_H */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index d8c5de40669d..5a6c7a190e5b 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -4,7 +4,6 @@
#include <asm/bootparam_utils.h>
#include <asm/e820/types.h>
#include <asm/processor.h>
-#include "pgtable.h"
#include "../string.h"
#include "efi.h"
diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c
new file mode 100644
index 000000000000..89dd02de2a0f
--- /dev/null
+++ b/arch/x86/boot/compressed/sev-handle-vc.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "sev.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <asm/insn.h>
+#include <asm/pgtable_types.h>
+#include <asm/ptrace.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+#include <asm/trap_pf.h>
+#include <asm/fpu/xcr.h>
+
+#define __BOOT_COMPRESSED
+
+/* Basic instruction decoding support needed */
+#include "../../lib/inat.c"
+#include "../../lib/insn.c"
+
+/*
+ * Copy a version of this function here - insn-eval.c can't be used in
+ * pre-decompression code.
+ */
+bool insn_has_rep_prefix(struct insn *insn)
+{
+ insn_byte_t p;
+ int i;
+
+ insn_get_prefixes(insn);
+
+ for_each_insn_prefix(insn, i, p) {
+ if (p == 0xf2 || p == 0xf3)
+ return true;
+ }
+
+ return false;
+}
+
+enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int ret;
+
+ memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+
+ return ES_OK;
+}
+
+extern void sev_insn_decode_init(void) __alias(inat_init_tables);
+
+/*
+ * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and
+ * doesn't use segments.
+ */
+static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ return 0UL;
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ void *dst, char *buf, size_t size)
+{
+ memcpy(dst, buf, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ void *src, char *buf, size_t size)
+{
+ memcpy(buf, src, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+ return ES_OK;
+}
+
+static bool fault_in_kernel_space(unsigned long address)
+{
+ return false;
+}
+
+#define sev_printk(fmt, ...)
+
+#include "../../coco/sev/vc-shared.c"
+
+void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
+{
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ if (!boot_ghcb && !early_setup_ghcb())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ vc_ghcb_invalidate(boot_ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ result = vc_check_opcode_bytes(&ctxt, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ switch (exit_code) {
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(boot_ghcb, &ctxt);
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(boot_ghcb, &ctxt);
+ break;
+ default:
+ result = ES_UNSUPPORTED;
+ break;
+ }
+
+finish:
+ if (result == ES_OK)
+ vc_finish_insn(&ctxt);
+ else if (result != ES_RETRY)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 0003e4416efd..612b443296d3 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -24,96 +24,11 @@
#include <asm/cpuid.h>
#include "error.h"
-#include "../msr.h"
+#include "sev.h"
static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
struct ghcb *boot_ghcb;
-/*
- * Copy a version of this function here - insn-eval.c can't be used in
- * pre-decompression code.
- */
-static bool insn_has_rep_prefix(struct insn *insn)
-{
- insn_byte_t p;
- int i;
-
- insn_get_prefixes(insn);
-
- for_each_insn_prefix(insn, i, p) {
- if (p == 0xf2 || p == 0xf3)
- return true;
- }
-
- return false;
-}
-
-/*
- * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and
- * doesn't use segments.
- */
-static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
-{
- return 0UL;
-}
-
-static inline u64 sev_es_rd_ghcb_msr(void)
-{
- struct msr m;
-
- boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
-
- return m.q;
-}
-
-static inline void sev_es_wr_ghcb_msr(u64 val)
-{
- struct msr m;
-
- m.q = val;
- boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
-}
-
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
-{
- char buffer[MAX_INSN_SIZE];
- int ret;
-
- memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
-
- ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
- if (ret < 0)
- return ES_DECODE_FAILED;
-
- return ES_OK;
-}
-
-static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
- void *dst, char *buf, size_t size)
-{
- memcpy(dst, buf, size);
-
- return ES_OK;
-}
-
-static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
- void *src, char *buf, size_t size)
-{
- memcpy(buf, src, size);
-
- return ES_OK;
-}
-
-static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
-{
- return ES_OK;
-}
-
-static bool fault_in_kernel_space(unsigned long address)
-{
- return false;
-}
-
#undef __init
#define __init
@@ -122,24 +37,27 @@ static bool fault_in_kernel_space(unsigned long address)
#define __BOOT_COMPRESSED
-/* Basic instruction decoding support needed */
-#include "../../lib/inat.c"
-#include "../../lib/insn.c"
+extern struct svsm_ca *boot_svsm_caa;
+extern u64 boot_svsm_caa_pa;
-/* Include code for early handlers */
-#include "../../coco/sev/shared.c"
-
-static struct svsm_ca *svsm_get_caa(void)
+struct svsm_ca *svsm_get_caa(void)
{
return boot_svsm_caa;
}
-static u64 svsm_get_caa_pa(void)
+u64 svsm_get_caa_pa(void)
{
return boot_svsm_caa_pa;
}
-static int svsm_perform_call_protocol(struct svsm_call *call)
+int svsm_perform_call_protocol(struct svsm_call *call);
+
+u8 snp_vmpl;
+
+/* Include code for early handlers */
+#include "../../boot/startup/sev-shared.c"
+
+int svsm_perform_call_protocol(struct svsm_call *call)
{
struct ghcb *ghcb;
int ret;
@@ -157,7 +75,7 @@ static int svsm_perform_call_protocol(struct svsm_call *call)
return ret;
}
-bool sev_snp_enabled(void)
+static bool sev_snp_enabled(void)
{
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
}
@@ -212,7 +130,7 @@ void snp_set_page_shared(unsigned long paddr)
__page_state_change(paddr, SNP_PAGE_STATE_SHARED);
}
-static bool early_setup_ghcb(void)
+bool early_setup_ghcb(void)
{
if (set_page_decrypted((unsigned long)&boot_ghcb_page))
return false;
@@ -223,7 +141,7 @@ static bool early_setup_ghcb(void)
boot_ghcb = &boot_ghcb_page;
/* Initialize lookup tables for the instruction decoder */
- inat_init_tables();
+ sev_insn_decode_init();
/* SNP guest requires the GHCB GPA must be registered */
if (sev_snp_enabled())
@@ -296,46 +214,6 @@ bool sev_es_check_ghcb_fault(unsigned long address)
return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page);
}
-void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
-{
- struct es_em_ctxt ctxt;
- enum es_result result;
-
- if (!boot_ghcb && !early_setup_ghcb())
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-
- vc_ghcb_invalidate(boot_ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
- if (result != ES_OK)
- goto finish;
-
- result = vc_check_opcode_bytes(&ctxt, exit_code);
- if (result != ES_OK)
- goto finish;
-
- switch (exit_code) {
- case SVM_EXIT_RDTSC:
- case SVM_EXIT_RDTSCP:
- result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
- break;
- case SVM_EXIT_IOIO:
- result = vc_handle_ioio(boot_ghcb, &ctxt);
- break;
- case SVM_EXIT_CPUID:
- result = vc_handle_cpuid(boot_ghcb, &ctxt);
- break;
- default:
- result = ES_UNSUPPORTED;
- break;
- }
-
-finish:
- if (result == ES_OK)
- vc_finish_insn(&ctxt);
- else if (result != ES_RETRY)
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-}
-
/*
* SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
* guest side implementation for proper functioning of the guest. If any
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
index d3900384b8ab..92f79c21939c 100644
--- a/arch/x86/boot/compressed/sev.h
+++ b/arch/x86/boot/compressed/sev.h
@@ -10,14 +10,31 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT
-bool sev_snp_enabled(void);
+#include "../msr.h"
+
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 sev_get_status(void);
bool early_is_sevsnp_guest(void);
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ struct msr m;
+
+ boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+
+ return m.q;
+}
+
+static inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ struct msr m;
+
+ m.q = val;
+ boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+}
+
#else
-static inline bool sev_snp_enabled(void) { return false; }
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
static inline u64 sev_get_status(void) { return 0; }
static inline bool early_is_sevsnp_guest(void) { return false; }
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 81fc1eaa3229..9af19d9614cb 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -15,9 +15,9 @@ static void *____memcpy(void *dest, const void *src, size_t n)
{
int d0, d1, d2;
asm volatile(
- "rep ; movsl\n\t"
+ "rep movsl\n\t"
"movl %4,%%ecx\n\t"
- "rep ; movsb\n\t"
+ "rep movsb"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
: "memory");
@@ -29,9 +29,9 @@ static void *____memcpy(void *dest, const void *src, size_t n)
{
long d0, d1, d2;
asm volatile(
- "rep ; movsq\n\t"
+ "rep movsq\n\t"
"movq %4,%%rcx\n\t"
- "rep ; movsb\n\t"
+ "rep movsb"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
: "memory");
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 6afd05e819d2..3973a67cd04e 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -22,10 +22,10 @@ SYM_FUNC_START_NOALIGN(memcpy)
movw %dx, %si
pushw %cx
shrw $2, %cx
- rep; movsl
+ rep movsl
popw %cx
andw $3, %cx
- rep; movsb
+ rep movsb
popw %di
popw %si
retl
@@ -38,10 +38,10 @@ SYM_FUNC_START_NOALIGN(memset)
imull $0x01010101,%eax
pushw %cx
shrw $2, %cx
- rep; stosl
+ rep stosl
popw %cx
andw $3, %cx
- rep; stosb
+ rep stosb
popw %di
retl
SYM_FUNC_END(memset)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b5c79f43359b..9cb91421b4e4 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -585,7 +585,7 @@ start_of_setup:
xorl %eax, %eax
subw %di, %cx
shrw $2, %cx
- rep; stosl
+ rep stosl
# Jump to C code (should not return)
calll main
diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile
new file mode 100644
index 000000000000..b514f7e81332
--- /dev/null
+++ b/arch/x86/boot/startup/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KBUILD_AFLAGS += -D__DISABLE_EXPORTS
+KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \
+ -Os -DDISABLE_BRANCH_PROFILING \
+ $(DISABLE_STACKLEAK_PLUGIN) \
+ -fno-stack-protector -D__NO_FORTIFY \
+ -fno-jump-tables \
+ -include $(srctree)/include/linux/hidden.h
+
+# disable ftrace hooks and LTO
+KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS))
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
+KMSAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o
+
+lib-$(CONFIG_X86_64) += la57toggle.o
+lib-$(CONFIG_EFI_MIXED) += efi-mixed.o
+
+#
+# Disable objtool validation for all library code, which is intended
+# to be linked into the decompressor or the EFI stub but not vmlinux
+#
+$(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y
diff --git a/arch/x86/boot/startup/efi-mixed.S b/arch/x86/boot/startup/efi-mixed.S
new file mode 100644
index 000000000000..e04ed99bc449
--- /dev/null
+++ b/arch/x86/boot/startup/efi-mixed.S
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
+ *
+ * Early support for invoking 32-bit EFI services from a 64-bit kernel.
+ *
+ * Because this thunking occurs before ExitBootServices() we have to
+ * restore the firmware's 32-bit GDT and IDT before we make EFI service
+ * calls.
+ *
+ * On the plus side, we don't have to worry about mangling 64-bit
+ * addresses into 32-bits because we're executing with an identity
+ * mapped pagetable and haven't transitioned to 64-bit virtual addresses
+ * yet.
+ */
+
+#include <linux/linkage.h>
+#include <asm/desc_defs.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+ .text
+ .code32
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+SYM_FUNC_START(efi32_stub_entry)
+ call 1f
+1: popl %ecx
+
+ /* Clear BSS */
+ xorl %eax, %eax
+ leal (_bss - 1b)(%ecx), %edi
+ leal (_ebss - 1b)(%ecx), %ecx
+ subl %edi, %ecx
+ shrl $2, %ecx
+ cld
+ rep stosl
+
+ add $0x4, %esp /* Discard return address */
+ movl 8(%esp), %ebx /* struct boot_params pointer */
+ jmp efi32_startup
+SYM_FUNC_END(efi32_stub_entry)
+#endif
+
+/*
+ * Called using a far call from __efi64_thunk() below, using the x86_64 SysV
+ * ABI (except for R8/R9 which are inaccessible to 32-bit code - EAX/EBX are
+ * used instead). EBP+16 points to the arguments passed via the stack.
+ *
+ * The first argument (EDI) is a pointer to the boot service or protocol, to
+ * which the remaining arguments are passed, each truncated to 32 bits.
+ */
+SYM_FUNC_START_LOCAL(efi_enter32)
+ /*
+ * Convert x86-64 SysV ABI params to i386 ABI
+ */
+ pushl 32(%ebp) /* Up to 3 args passed via the stack */
+ pushl 24(%ebp)
+ pushl 16(%ebp)
+ pushl %ebx /* R9 */
+ pushl %eax /* R8 */
+ pushl %ecx
+ pushl %edx
+ pushl %esi
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Disable long mode via EFER */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax
+ wrmsr
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ call efi32_enable_long_mode
+
+ addl $32, %esp
+ movl %edi, %eax
+ lret
+SYM_FUNC_END(efi_enter32)
+
+ .code64
+SYM_FUNC_START(__efi64_thunk)
+ push %rbp
+ movl %esp, %ebp
+ push %rbx
+
+ /* Move args #5 and #6 into 32-bit accessible registers */
+ movl %r8d, %eax
+ movl %r9d, %ebx
+
+ lcalll *efi32_call(%rip)
+
+ pop %rbx
+ pop %rbp
+ RET
+SYM_FUNC_END(__efi64_thunk)
+
+ .code32
+SYM_FUNC_START_LOCAL(efi32_enable_long_mode)
+ movl %cr4, %eax
+ btsl $(X86_CR4_PAE_BIT), %eax
+ movl %eax, %cr4
+
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Disable interrupts - the firmware's IDT does not work in long mode */
+ cli
+
+ /* Enable paging */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+ ret
+SYM_FUNC_END(efi32_enable_long_mode)
+
+/*
+ * This is the common EFI stub entry point for mixed mode. It sets up the GDT
+ * and page tables needed for 64-bit execution, after which it calls the
+ * common 64-bit EFI entrypoint efi_stub_entry().
+ *
+ * Arguments: 0(%esp) image handle
+ * 4(%esp) EFI system table pointer
+ * %ebx struct boot_params pointer (or NULL)
+ *
+ * Since this is the point of no return for ordinary execution, no registers
+ * are considered live except for the function parameters. [Note that the EFI
+ * stub may still exit and return to the firmware using the Exit() EFI boot
+ * service.]
+ */
+SYM_FUNC_START_LOCAL(efi32_startup)
+ movl %esp, %ebp
+
+ subl $8, %esp
+ sgdtl (%esp) /* Save GDT descriptor to the stack */
+ movl 2(%esp), %esi /* Existing GDT pointer */
+ movzwl (%esp), %ecx /* Existing GDT limit */
+ inc %ecx /* Existing GDT size */
+ andl $~7, %ecx /* Ensure size is multiple of 8 */
+
+ subl %ecx, %esp /* Allocate new GDT */
+ andl $~15, %esp /* Realign the stack */
+ movl %esp, %edi /* New GDT address */
+ leal 7(%ecx), %eax /* New GDT limit */
+ pushw %cx /* Push 64-bit CS (for LJMP below) */
+ pushl %edi /* Push new GDT address */
+ pushw %ax /* Push new GDT limit */
+
+ /* Copy GDT to the stack and add a 64-bit code segment at the end */
+ movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) & 0xffffffff, (%edi,%ecx)
+ movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) >> 32, 4(%edi,%ecx)
+ shrl $2, %ecx
+ cld
+ rep movsl /* Copy the firmware GDT */
+ lgdtl (%esp) /* Switch to the new GDT */
+
+ call 1f
+1: pop %edi
+
+ /* Record mixed mode entry */
+ movb $0x0, (efi_is64 - 1b)(%edi)
+
+ /* Set up indirect far call to re-enter 32-bit mode */
+ leal (efi32_call - 1b)(%edi), %eax
+ addl %eax, (%eax)
+ movw %cs, 4(%eax)
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Set up 1:1 mapping */
+ leal (pte - 1b)(%edi), %eax
+ movl $_PAGE_PRESENT | _PAGE_RW | _PAGE_PSE, %ecx
+ leal (_PAGE_PRESENT | _PAGE_RW)(%eax), %edx
+2: movl %ecx, (%eax)
+ addl $8, %eax
+ addl $PMD_SIZE, %ecx
+ jnc 2b
+
+ movl $PAGE_SIZE, %ecx
+ .irpc l, 0123
+ movl %edx, \l * 8(%eax)
+ addl %ecx, %edx
+ .endr
+ addl %ecx, %eax
+ movl %edx, (%eax)
+ movl %eax, %cr3
+
+ call efi32_enable_long_mode
+
+ /* Set up far jump to 64-bit mode (CS is already on the stack) */
+ leal (efi_stub_entry - 1b)(%edi), %eax
+ movl %eax, 2(%esp)
+
+ movl 0(%ebp), %edi
+ movl 4(%ebp), %esi
+ movl %ebx, %edx
+ ljmpl *2(%esp)
+SYM_FUNC_END(efi32_startup)
+
+/*
+ * efi_status_t efi32_pe_entry(efi_handle_t image_handle,
+ * efi_system_table_32_t *sys_table)
+ */
+SYM_FUNC_START(efi32_pe_entry)
+ pushl %ebx // save callee-save registers
+
+ /* Check whether the CPU supports long mode */
+ movl $0x80000001, %eax // assume extended info support
+ cpuid
+ btl $29, %edx // check long mode bit
+ jnc 1f
+ leal 8(%esp), %esp // preserve stack alignment
+ xor %ebx, %ebx // no struct boot_params pointer
+ jmp efi32_startup // only ESP and EBX remain live
+1: movl $0x80000003, %eax // EFI_UNSUPPORTED
+ popl %ebx
+ RET
+SYM_FUNC_END(efi32_pe_entry)
+
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+ .org efi32_stub_entry + 0x200
+ .code64
+SYM_FUNC_START_NOALIGN(efi64_stub_entry)
+ jmp efi_handover_entry
+SYM_FUNC_END(efi64_stub_entry)
+#endif
+
+ .data
+ .balign 8
+SYM_DATA_START_LOCAL(efi32_call)
+ .long efi_enter32 - .
+ .word 0x0
+SYM_DATA_END(efi32_call)
+SYM_DATA(efi_is64, .byte 1)
+
+ .bss
+ .balign PAGE_SIZE
+SYM_DATA_LOCAL(pte, .fill 6 * PAGE_SIZE, 1, 0)
diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c
new file mode 100644
index 000000000000..a3112a69b06a
--- /dev/null
+++ b/arch/x86/boot/startup/gdt_idt.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <linux/types.h>
+
+#include <asm/desc.h>
+#include <asm/init.h>
+#include <asm/setup.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+
+/*
+ * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is
+ * used until the idt_table takes over. On the boot CPU this happens in
+ * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases
+ * this happens in the functions called from head_64.S.
+ *
+ * The idt_table can't be used that early because all the code modifying it is
+ * in idt.c and can be instrumented by tracing or KASAN, which both don't work
+ * during early CPU bringup. Also the idt_table has the runtime vectors
+ * configured which require certain CPU state to be setup already (like TSS),
+ * which also hasn't happened yet in early CPU bringup.
+ */
+static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
+
+/* This may run while still in the direct mapping */
+void __head startup_64_load_idt(void *vc_handler)
+{
+ struct desc_ptr desc = {
+ .address = (unsigned long)rip_rel_ptr(bringup_idt_table),
+ .size = sizeof(bringup_idt_table) - 1,
+ };
+ struct idt_data data;
+ gate_desc idt_desc;
+
+ /* @vc_handler is set only for a VMM Communication Exception */
+ if (vc_handler) {
+ init_idt_data(&data, X86_TRAP_VC, vc_handler);
+ idt_init_desc(&idt_desc, &data);
+ native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc);
+ }
+
+ native_load_idt(&desc);
+}
+
+/*
+ * Setup boot CPU state needed before kernel switches to virtual addresses.
+ */
+void __head startup_64_setup_gdt_idt(void)
+{
+ struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page);
+ void *handler = NULL;
+
+ struct desc_ptr startup_gdt_descr = {
+ .address = (unsigned long)gp->gdt,
+ .size = GDT_SIZE - 1,
+ };
+
+ /* Load GDT */
+ native_load_gdt(&startup_gdt_descr);
+
+ /* New GDT is live - reload data segment registers */
+ asm volatile("movl %%eax, %%ds\n"
+ "movl %%eax, %%ss\n"
+ "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ handler = rip_rel_ptr(vc_no_ghcb);
+
+ startup_64_load_idt(handler);
+}
diff --git a/arch/x86/boot/compressed/la57toggle.S b/arch/x86/boot/startup/la57toggle.S
index 9ee002387eb1..370075b4d95b 100644
--- a/arch/x86/boot/compressed/la57toggle.S
+++ b/arch/x86/boot/startup/la57toggle.S
@@ -5,7 +5,6 @@
#include <asm/boot.h>
#include <asm/msr.h>
#include <asm/processor-flags.h>
-#include "pgtable.h"
/*
* This is the 32-bit trampoline that will be copied over to low memory. It
diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c
new file mode 100644
index 000000000000..099ae2559336
--- /dev/null
+++ b/arch/x86/boot/startup/map_kernel.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+#include <asm/init.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/sev.h>
+
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+extern unsigned int next_early_pgt;
+
+static inline bool check_la57_support(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return false;
+
+ /*
+ * 5-level paging is detected and enabled at kernel decompression
+ * stage. Only check if it has been enabled there.
+ */
+ if (!(native_read_cr4() & X86_CR4_LA57))
+ return false;
+
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ page_offset_base = __PAGE_OFFSET_BASE_L5;
+ vmalloc_base = __VMALLOC_BASE_L5;
+ vmemmap_base = __VMEMMAP_BASE_L5;
+
+ return true;
+}
+
+static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
+ pmdval_t *pmd,
+ unsigned long p2v_offset)
+{
+ unsigned long paddr, paddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
+ paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);
+
+ for (; paddr < paddr_end; paddr += PMD_SIZE) {
+ /*
+ * On SNP, transition the page to shared in the RMP table so that
+ * it is consistent with the page table attribute change.
+ *
+ * __start_bss_decrypted has a virtual address in the high range
+ * mapping (kernel .text). PVALIDATE, by way of
+ * early_snp_set_memory_shared(), requires a valid virtual
+ * address but the kernel is currently running off of the identity
+ * mapping so use the PA to get a *currently* valid virtual address.
+ */
+ early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
+
+ i = pmd_index(paddr - p2v_offset);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
+/*
+ * This code is compiled using PIC codegen because it will execute from the
+ * early 1:1 mapping of memory, which deviates from the mapping expected by the
+ * linker. Due to this deviation, taking the address of a global variable will
+ * produce an ambiguous result when using the plain & operator. Instead,
+ * rip_rel_ptr() must be used, which will return the RIP-relative address in
+ * the 1:1 mapping of memory. Kernel virtual addresses can be determined by
+ * subtracting p2v_offset from the RIP-relative address.
+ */
+unsigned long __head __startup_64(unsigned long p2v_offset,
+ struct boot_params *bp)
+{
+ pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
+ unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
+ unsigned long va_text, va_end;
+ unsigned long pgtable_flags;
+ unsigned long load_delta;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ bool la57;
+ int i;
+
+ la57 = check_la57_support();
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ phys_base = load_delta = __START_KERNEL_map + p2v_offset;
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_MASK)
+ for (;;);
+
+ va_text = physaddr - p2v_offset;
+ va_end = (unsigned long)rip_rel_ptr(_end) - p2v_offset;
+
+ /* Include the SME encryption mask in the fixup value */
+ load_delta += sme_get_me_mask();
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = rip_rel_ptr(early_top_pgt);
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
+ p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
+ p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+ pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
+ }
+
+ level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta;
+ level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta;
+
+ for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+ level2_fixmap_pgt[i].pmd += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ pud = &early_pgts[0]->pmd;
+ pmd = &early_pgts[1]->pmd;
+ next_early_pgt = 2;
+
+ pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
+ if (la57) {
+ p4d = &early_pgts[next_early_pgt++]->pmd;
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
+
+ i = physaddr >> P4D_SHIFT;
+ p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
+ }
+
+ i = physaddr >> PUD_SHIFT;
+ pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+ pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += sme_get_me_mask();
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT);
+
+ pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ *
+ * Only the region occupied by the kernel image has so far
+ * been checked against the table of usable memory regions
+ * provided by the firmware, so invalidate pages outside that
+ * region. A page table entry that maps to a reserved area of
+ * memory would allow processor speculation into that area,
+ * and on some hardware (particularly the UV platform) even
+ * speculative access to some reserved areas is caught as an
+ * error, causing the BIOS to halt the system.
+ */
+
+ pmd = rip_rel_ptr(level2_kernel_pgt);
+
+ /* invalidate pages before the kernel image */
+ for (i = 0; i < pmd_index(va_text); i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ /* fixup pages that are part of the kernel image */
+ for (; i <= pmd_index(va_end); i++)
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+
+ /* invalidate pages after the kernel image */
+ for (; i < PTRS_PER_PMD; i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ return sme_postprocess_startup(bp, pmd, p2v_offset);
+}
diff --git a/arch/x86/coco/sev/shared.c b/arch/x86/boot/startup/sev-shared.c
index 2e4122f8aa6b..7a706db87b93 100644
--- a/arch/x86/coco/sev/shared.c
+++ b/arch/x86/boot/startup/sev-shared.c
@@ -14,76 +14,23 @@
#ifndef __BOOT_COMPRESSED
#define error(v) pr_err(v)
#define has_cpuflag(f) boot_cpu_has(f)
-#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__)
-#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__)
#else
#undef WARN
#define WARN(condition, format...) (!!(condition))
-#define sev_printk(fmt, ...)
-#define sev_printk_rtl(fmt, ...)
#undef vc_forward_exception
#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n")
#endif
/*
* SVSM related information:
- * When running under an SVSM, the VMPL that Linux is executing at must be
- * non-zero. The VMPL is therefore used to indicate the presence of an SVSM.
- *
* During boot, the page tables are set up as identity mapped and later
* changed to use kernel virtual addresses. Maintain separate virtual and
* physical addresses for the CAA to allow SVSM functions to be used during
* early boot, both with identity mapped virtual addresses and proper kernel
* virtual addresses.
*/
-u8 snp_vmpl __ro_after_init;
-EXPORT_SYMBOL_GPL(snp_vmpl);
-static struct svsm_ca *boot_svsm_caa __ro_after_init;
-static u64 boot_svsm_caa_pa __ro_after_init;
-
-static struct svsm_ca *svsm_get_caa(void);
-static u64 svsm_get_caa_pa(void);
-static int svsm_perform_call_protocol(struct svsm_call *call);
-
-/* I/O parameters for CPUID-related helpers */
-struct cpuid_leaf {
- u32 fn;
- u32 subfn;
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
-};
-
-/*
- * Individual entries of the SNP CPUID table, as defined by the SNP
- * Firmware ABI, Revision 0.9, Section 7.1, Table 14.
- */
-struct snp_cpuid_fn {
- u32 eax_in;
- u32 ecx_in;
- u64 xcr0_in;
- u64 xss_in;
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u64 __reserved;
-} __packed;
-
-/*
- * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9,
- * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit
- * of 64 entries per CPUID table.
- */
-#define SNP_CPUID_COUNT_MAX 64
-
-struct snp_cpuid_table {
- u32 count;
- u32 __reserved1;
- u64 __reserved2;
- struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
-} __packed;
+struct svsm_ca *boot_svsm_caa __ro_after_init;
+u64 boot_svsm_caa_pa __ro_after_init;
/*
* Since feature negotiation related variables are set early in the boot
@@ -107,7 +54,7 @@ static u32 cpuid_std_range_max __ro_after_init;
static u32 cpuid_hyp_range_max __ro_after_init;
static u32 cpuid_ext_range_max __ro_after_init;
-static bool __init sev_es_check_cpu_features(void)
+bool __init sev_es_check_cpu_features(void)
{
if (!has_cpuflag(X86_FEATURE_RDRAND)) {
error("RDRAND instruction not supported - no trusted source of randomness available\n");
@@ -117,7 +64,7 @@ static bool __init sev_es_check_cpu_features(void)
return true;
}
-static void __head __noreturn
+void __head __noreturn
sev_es_terminate(unsigned int set, unsigned int reason)
{
u64 val = GHCB_MSR_TERM_REQ;
@@ -136,7 +83,7 @@ sev_es_terminate(unsigned int set, unsigned int reason)
/*
* The hypervisor features are available from GHCB version 2 onward.
*/
-static u64 get_hv_features(void)
+u64 get_hv_features(void)
{
u64 val;
@@ -153,7 +100,7 @@ static u64 get_hv_features(void)
return GHCB_MSR_HV_FT_RESP_VAL(val);
}
-static void snp_register_ghcb_early(unsigned long paddr)
+void snp_register_ghcb_early(unsigned long paddr)
{
unsigned long pfn = paddr >> PAGE_SHIFT;
u64 val;
@@ -169,7 +116,7 @@ static void snp_register_ghcb_early(unsigned long paddr)
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);
}
-static bool sev_es_negotiate_protocol(void)
+bool sev_es_negotiate_protocol(void)
{
u64 val;
@@ -190,39 +137,6 @@ static bool sev_es_negotiate_protocol(void)
return true;
}
-static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
-{
- ghcb->save.sw_exit_code = 0;
- __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
-}
-
-static bool vc_decoding_needed(unsigned long exit_code)
-{
- /* Exceptions don't require to decode the instruction */
- return !(exit_code >= SVM_EXIT_EXCP_BASE &&
- exit_code <= SVM_EXIT_LAST_EXCP);
-}
-
-static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
- struct pt_regs *regs,
- unsigned long exit_code)
-{
- enum es_result ret = ES_OK;
-
- memset(ctxt, 0, sizeof(*ctxt));
- ctxt->regs = regs;
-
- if (vc_decoding_needed(exit_code))
- ret = vc_decode_insn(ctxt);
-
- return ret;
-}
-
-static void vc_finish_insn(struct es_em_ctxt *ctxt)
-{
- ctxt->regs->ip += ctxt->insn.length;
-}
-
static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
u32 ret;
@@ -344,7 +258,7 @@ static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call)
* Fill in protocol and format specifiers. This can be called very early
* in the boot, so use rip-relative references as needed.
*/
- ghcb->protocol_version = RIP_REL_REF(ghcb_version);
+ ghcb->protocol_version = ghcb_version;
ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL);
@@ -371,10 +285,10 @@ static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call)
return svsm_process_result_codes(call);
}
-static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- u64 exit_code, u64 exit_info_1,
- u64 exit_info_2)
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2)
{
/* Fill in protocol and format specifiers */
ghcb->protocol_version = ghcb_version;
@@ -473,9 +387,9 @@ static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid
* while running with the initial identity mapping as well as the
* switch-over to kernel virtual addresses later.
*/
-static const struct snp_cpuid_table *snp_cpuid_get_table(void)
+const struct snp_cpuid_table *snp_cpuid_get_table(void)
{
- return &RIP_REL_REF(cpuid_table_copy);
+ return rip_rel_ptr(&cpuid_table_copy);
}
/*
@@ -672,7 +586,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
* Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
* should be treated as fatal by caller.
*/
-static int __head
+int __head
snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -701,9 +615,9 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
/* Skip post-processing for out-of-range zero leafs. */
- if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) ||
- (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) ||
- (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max))))
+ if (!(leaf->fn <= cpuid_std_range_max ||
+ (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
+ (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
return 0;
}
@@ -782,391 +696,6 @@ fail:
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
}
-static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt,
- unsigned long address,
- bool write)
-{
- if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_USER;
- ctxt->fi.cr2 = address;
- if (write)
- ctxt->fi.error_code |= X86_PF_WRITE;
-
- return ES_EXCEPTION;
- }
-
- return ES_OK;
-}
-
-static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
- void *src, char *buf,
- unsigned int data_size,
- unsigned int count,
- bool backwards)
-{
- int i, b = backwards ? -1 : 1;
- unsigned long address = (unsigned long)src;
- enum es_result ret;
-
- ret = vc_insn_string_check(ctxt, address, false);
- if (ret != ES_OK)
- return ret;
-
- for (i = 0; i < count; i++) {
- void *s = src + (i * data_size * b);
- char *d = buf + (i * data_size);
-
- ret = vc_read_mem(ctxt, s, d, data_size);
- if (ret != ES_OK)
- break;
- }
-
- return ret;
-}
-
-static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
- void *dst, char *buf,
- unsigned int data_size,
- unsigned int count,
- bool backwards)
-{
- int i, s = backwards ? -1 : 1;
- unsigned long address = (unsigned long)dst;
- enum es_result ret;
-
- ret = vc_insn_string_check(ctxt, address, true);
- if (ret != ES_OK)
- return ret;
-
- for (i = 0; i < count; i++) {
- void *d = dst + (i * data_size * s);
- char *b = buf + (i * data_size);
-
- ret = vc_write_mem(ctxt, d, b, data_size);
- if (ret != ES_OK)
- break;
- }
-
- return ret;
-}
-
-#define IOIO_TYPE_STR BIT(2)
-#define IOIO_TYPE_IN 1
-#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR)
-#define IOIO_TYPE_OUT 0
-#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR)
-
-#define IOIO_REP BIT(3)
-
-#define IOIO_ADDR_64 BIT(9)
-#define IOIO_ADDR_32 BIT(8)
-#define IOIO_ADDR_16 BIT(7)
-
-#define IOIO_DATA_32 BIT(6)
-#define IOIO_DATA_16 BIT(5)
-#define IOIO_DATA_8 BIT(4)
-
-#define IOIO_SEG_ES (0 << 10)
-#define IOIO_SEG_DS (3 << 10)
-
-static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
-{
- struct insn *insn = &ctxt->insn;
- size_t size;
- u64 port;
-
- *exitinfo = 0;
-
- switch (insn->opcode.bytes[0]) {
- /* INS opcodes */
- case 0x6c:
- case 0x6d:
- *exitinfo |= IOIO_TYPE_INS;
- *exitinfo |= IOIO_SEG_ES;
- port = ctxt->regs->dx & 0xffff;
- break;
-
- /* OUTS opcodes */
- case 0x6e:
- case 0x6f:
- *exitinfo |= IOIO_TYPE_OUTS;
- *exitinfo |= IOIO_SEG_DS;
- port = ctxt->regs->dx & 0xffff;
- break;
-
- /* IN immediate opcodes */
- case 0xe4:
- case 0xe5:
- *exitinfo |= IOIO_TYPE_IN;
- port = (u8)insn->immediate.value & 0xffff;
- break;
-
- /* OUT immediate opcodes */
- case 0xe6:
- case 0xe7:
- *exitinfo |= IOIO_TYPE_OUT;
- port = (u8)insn->immediate.value & 0xffff;
- break;
-
- /* IN register opcodes */
- case 0xec:
- case 0xed:
- *exitinfo |= IOIO_TYPE_IN;
- port = ctxt->regs->dx & 0xffff;
- break;
-
- /* OUT register opcodes */
- case 0xee:
- case 0xef:
- *exitinfo |= IOIO_TYPE_OUT;
- port = ctxt->regs->dx & 0xffff;
- break;
-
- default:
- return ES_DECODE_FAILED;
- }
-
- *exitinfo |= port << 16;
-
- switch (insn->opcode.bytes[0]) {
- case 0x6c:
- case 0x6e:
- case 0xe4:
- case 0xe6:
- case 0xec:
- case 0xee:
- /* Single byte opcodes */
- *exitinfo |= IOIO_DATA_8;
- size = 1;
- break;
- default:
- /* Length determined by instruction parsing */
- *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
- : IOIO_DATA_32;
- size = (insn->opnd_bytes == 2) ? 2 : 4;
- }
-
- switch (insn->addr_bytes) {
- case 2:
- *exitinfo |= IOIO_ADDR_16;
- break;
- case 4:
- *exitinfo |= IOIO_ADDR_32;
- break;
- case 8:
- *exitinfo |= IOIO_ADDR_64;
- break;
- }
-
- if (insn_has_rep_prefix(insn))
- *exitinfo |= IOIO_REP;
-
- return vc_ioio_check(ctxt, (u16)port, size);
-}
-
-static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- u64 exit_info_1, exit_info_2;
- enum es_result ret;
-
- ret = vc_ioio_exitinfo(ctxt, &exit_info_1);
- if (ret != ES_OK)
- return ret;
-
- if (exit_info_1 & IOIO_TYPE_STR) {
-
- /* (REP) INS/OUTS */
-
- bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF);
- unsigned int io_bytes, exit_bytes;
- unsigned int ghcb_count, op_count;
- unsigned long es_base;
- u64 sw_scratch;
-
- /*
- * For the string variants with rep prefix the amount of in/out
- * operations per #VC exception is limited so that the kernel
- * has a chance to take interrupts and re-schedule while the
- * instruction is emulated.
- */
- io_bytes = (exit_info_1 >> 4) & 0x7;
- ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes;
-
- op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1;
- exit_info_2 = min(op_count, ghcb_count);
- exit_bytes = exit_info_2 * io_bytes;
-
- es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
-
- /* Read bytes of OUTS into the shared buffer */
- if (!(exit_info_1 & IOIO_TYPE_IN)) {
- ret = vc_insn_string_read(ctxt,
- (void *)(es_base + regs->si),
- ghcb->shared_buffer, io_bytes,
- exit_info_2, df);
- if (ret)
- return ret;
- }
-
- /*
- * Issue an VMGEXIT to the HV to consume the bytes from the
- * shared buffer or to have it write them into the shared buffer
- * depending on the instruction: OUTS or INS.
- */
- sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
- ghcb_set_sw_scratch(ghcb, sw_scratch);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
- exit_info_1, exit_info_2);
- if (ret != ES_OK)
- return ret;
-
- /* Read bytes from shared buffer into the guest's destination. */
- if (exit_info_1 & IOIO_TYPE_IN) {
- ret = vc_insn_string_write(ctxt,
- (void *)(es_base + regs->di),
- ghcb->shared_buffer, io_bytes,
- exit_info_2, df);
- if (ret)
- return ret;
-
- if (df)
- regs->di -= exit_bytes;
- else
- regs->di += exit_bytes;
- } else {
- if (df)
- regs->si -= exit_bytes;
- else
- regs->si += exit_bytes;
- }
-
- if (exit_info_1 & IOIO_REP)
- regs->cx -= exit_info_2;
-
- ret = regs->cx ? ES_RETRY : ES_OK;
-
- } else {
-
- /* IN/OUT into/from rAX */
-
- int bits = (exit_info_1 & 0x70) >> 1;
- u64 rax = 0;
-
- if (!(exit_info_1 & IOIO_TYPE_IN))
- rax = lower_bits(regs->ax, bits);
-
- ghcb_set_rax(ghcb, rax);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
- if (ret != ES_OK)
- return ret;
-
- if (exit_info_1 & IOIO_TYPE_IN) {
- if (!ghcb_rax_is_valid(ghcb))
- return ES_VMM_ERROR;
- regs->ax = lower_bits(ghcb->save.rax, bits);
- }
- }
-
- return ret;
-}
-
-static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- struct cpuid_leaf leaf;
- int ret;
-
- leaf.fn = regs->ax;
- leaf.subfn = regs->cx;
- ret = snp_cpuid(ghcb, ctxt, &leaf);
- if (!ret) {
- regs->ax = leaf.eax;
- regs->bx = leaf.ebx;
- regs->cx = leaf.ecx;
- regs->dx = leaf.edx;
- }
-
- return ret;
-}
-
-static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- u32 cr4 = native_read_cr4();
- enum es_result ret;
- int snp_cpuid_ret;
-
- snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt);
- if (!snp_cpuid_ret)
- return ES_OK;
- if (snp_cpuid_ret != -EOPNOTSUPP)
- return ES_VMM_ERROR;
-
- ghcb_set_rax(ghcb, regs->ax);
- ghcb_set_rcx(ghcb, regs->cx);
-
- if (cr4 & X86_CR4_OSXSAVE)
- /* Safe to read xcr0 */
- ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
- else
- /* xgetbv will cause #GP - use reset value for xcr0 */
- ghcb_set_xcr0(ghcb, 1);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) &&
- ghcb_rbx_is_valid(ghcb) &&
- ghcb_rcx_is_valid(ghcb) &&
- ghcb_rdx_is_valid(ghcb)))
- return ES_VMM_ERROR;
-
- regs->ax = ghcb->save.rax;
- regs->bx = ghcb->save.rbx;
- regs->cx = ghcb->save.rcx;
- regs->dx = ghcb->save.rdx;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- unsigned long exit_code)
-{
- bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
- enum es_result ret;
-
- /*
- * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure
- * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP
- * instructions are being intercepted. If this should occur and Secure
- * TSC is enabled, guest execution should be terminated as the guest
- * cannot rely on the TSC value provided by the hypervisor.
- */
- if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
- return ES_VMM_ERROR;
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) &&
- (!rdtscp || ghcb_rcx_is_valid(ghcb))))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
- ctxt->regs->dx = ghcb->save.rdx;
- if (rdtscp)
- ctxt->regs->cx = ghcb->save.rcx;
-
- return ES_OK;
-}
-
struct cc_setup_data {
struct setup_data header;
u32 cc_blob_address;
@@ -1224,36 +753,14 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
if (fn->eax_in == 0x0)
- RIP_REL_REF(cpuid_std_range_max) = fn->eax;
+ cpuid_std_range_max = fn->eax;
else if (fn->eax_in == 0x40000000)
- RIP_REL_REF(cpuid_hyp_range_max) = fn->eax;
+ cpuid_hyp_range_max = fn->eax;
else if (fn->eax_in == 0x80000000)
- RIP_REL_REF(cpuid_ext_range_max) = fn->eax;
+ cpuid_ext_range_max = fn->eax;
}
}
-static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size,
- int ret, u64 svsm_ret)
-{
- WARN(1, "PVALIDATE failure: pfn: 0x%llx, action: %u, size: %u, ret: %d, svsm_ret: 0x%llx\n",
- pfn, action, page_size, ret, svsm_ret);
-
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
-}
-
-static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svsm_ret)
-{
- unsigned int page_size;
- bool action;
- u64 pfn;
-
- pfn = pc->entry[pc->cur_index].pfn;
- action = pc->entry[pc->cur_index].action;
- page_size = pc->entry[pc->cur_index].page_size;
-
- __pval_terminate(pfn, action, page_size, ret, svsm_ret);
-}
-
static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
{
struct svsm_pvalidate_call *pc;
@@ -1296,11 +803,7 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
{
int ret;
- /*
- * This can be called very early during boot, so use rIP-relative
- * references as needed.
- */
- if (RIP_REL_REF(snp_vmpl)) {
+ if (snp_vmpl) {
svsm_pval_4k_page(paddr, validate);
} else {
ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
@@ -1309,351 +812,6 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
}
}
-static void pval_pages(struct snp_psc_desc *desc)
-{
- struct psc_entry *e;
- unsigned long vaddr;
- unsigned int size;
- unsigned int i;
- bool validate;
- u64 pfn;
- int rc;
-
- for (i = 0; i <= desc->hdr.end_entry; i++) {
- e = &desc->entries[i];
-
- pfn = e->gfn;
- vaddr = (unsigned long)pfn_to_kaddr(pfn);
- size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
- validate = e->operation == SNP_PAGE_STATE_PRIVATE;
-
- rc = pvalidate(vaddr, size, validate);
- if (!rc)
- continue;
-
- if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) {
- unsigned long vaddr_end = vaddr + PMD_SIZE;
-
- for (; vaddr < vaddr_end; vaddr += PAGE_SIZE, pfn++) {
- rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
- if (rc)
- __pval_terminate(pfn, validate, RMP_PG_SIZE_4K, rc, 0);
- }
- } else {
- __pval_terminate(pfn, validate, size, rc, 0);
- }
- }
-}
-
-static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action,
- struct svsm_pvalidate_call *pc)
-{
- struct svsm_pvalidate_entry *pe;
-
- /* Nothing in the CA yet */
- pc->num_entries = 0;
- pc->cur_index = 0;
-
- pe = &pc->entry[0];
-
- while (pfn < pfn_end) {
- pe->page_size = RMP_PG_SIZE_4K;
- pe->action = action;
- pe->ignore_cf = 0;
- pe->pfn = pfn;
-
- pe++;
- pfn++;
-
- pc->num_entries++;
- if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT)
- break;
- }
-
- return pfn;
-}
-
-static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int desc_entry,
- struct svsm_pvalidate_call *pc)
-{
- struct svsm_pvalidate_entry *pe;
- struct psc_entry *e;
-
- /* Nothing in the CA yet */
- pc->num_entries = 0;
- pc->cur_index = 0;
-
- pe = &pc->entry[0];
- e = &desc->entries[desc_entry];
-
- while (desc_entry <= desc->hdr.end_entry) {
- pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
- pe->action = e->operation == SNP_PAGE_STATE_PRIVATE;
- pe->ignore_cf = 0;
- pe->pfn = e->gfn;
-
- pe++;
- e++;
-
- desc_entry++;
- pc->num_entries++;
- if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT)
- break;
- }
-
- return desc_entry;
-}
-
-static void svsm_pval_pages(struct snp_psc_desc *desc)
-{
- struct svsm_pvalidate_entry pv_4k[VMGEXIT_PSC_MAX_ENTRY];
- unsigned int i, pv_4k_count = 0;
- struct svsm_pvalidate_call *pc;
- struct svsm_call call = {};
- unsigned long flags;
- bool action;
- u64 pc_pa;
- int ret;
-
- /*
- * This can be called very early in the boot, use native functions in
- * order to avoid paravirt issues.
- */
- flags = native_local_irq_save();
-
- /*
- * The SVSM calling area (CA) can support processing 510 entries at a
- * time. Loop through the Page State Change descriptor until the CA is
- * full or the last entry in the descriptor is reached, at which time
- * the SVSM is invoked. This repeats until all entries in the descriptor
- * are processed.
- */
- call.caa = svsm_get_caa();
-
- pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer;
- pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
-
- /* Protocol 0, Call ID 1 */
- call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
- call.rcx = pc_pa;
-
- for (i = 0; i <= desc->hdr.end_entry;) {
- i = svsm_build_ca_from_psc_desc(desc, i, pc);
-
- do {
- ret = svsm_perform_call_protocol(&call);
- if (!ret)
- continue;
-
- /*
- * Check if the entry failed because of an RMP mismatch (a
- * PVALIDATE at 2M was requested, but the page is mapped in
- * the RMP as 4K).
- */
-
- if (call.rax_out == SVSM_PVALIDATE_FAIL_SIZEMISMATCH &&
- pc->entry[pc->cur_index].page_size == RMP_PG_SIZE_2M) {
- /* Save this entry for post-processing at 4K */
- pv_4k[pv_4k_count++] = pc->entry[pc->cur_index];
-
- /* Skip to the next one unless at the end of the list */
- pc->cur_index++;
- if (pc->cur_index < pc->num_entries)
- ret = -EAGAIN;
- else
- ret = 0;
- }
- } while (ret == -EAGAIN);
-
- if (ret)
- svsm_pval_terminate(pc, ret, call.rax_out);
- }
-
- /* Process any entries that failed to be validated at 2M and validate them at 4K */
- for (i = 0; i < pv_4k_count; i++) {
- u64 pfn, pfn_end;
-
- action = pv_4k[i].action;
- pfn = pv_4k[i].pfn;
- pfn_end = pfn + 512;
-
- while (pfn < pfn_end) {
- pfn = svsm_build_ca_from_pfn_range(pfn, pfn_end, action, pc);
-
- ret = svsm_perform_call_protocol(&call);
- if (ret)
- svsm_pval_terminate(pc, ret, call.rax_out);
- }
- }
-
- native_local_irq_restore(flags);
-}
-
-static void pvalidate_pages(struct snp_psc_desc *desc)
-{
- if (snp_vmpl)
- svsm_pval_pages(desc);
- else
- pval_pages(desc);
-}
-
-static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
-{
- int cur_entry, end_entry, ret = 0;
- struct snp_psc_desc *data;
- struct es_em_ctxt ctxt;
-
- vc_ghcb_invalidate(ghcb);
-
- /* Copy the input desc into GHCB shared buffer */
- data = (struct snp_psc_desc *)ghcb->shared_buffer;
- memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
-
- /*
- * As per the GHCB specification, the hypervisor can resume the guest
- * before processing all the entries. Check whether all the entries
- * are processed. If not, then keep retrying. Note, the hypervisor
- * will update the data memory directly to indicate the status, so
- * reference the data->hdr everywhere.
- *
- * The strategy here is to wait for the hypervisor to change the page
- * state in the RMP table before guest accesses the memory pages. If the
- * page state change was not successful, then later memory access will
- * result in a crash.
- */
- cur_entry = data->hdr.cur_entry;
- end_entry = data->hdr.end_entry;
-
- while (data->hdr.cur_entry <= data->hdr.end_entry) {
- ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
-
- /* This will advance the shared buffer data points to. */
- ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
-
- /*
- * Page State Change VMGEXIT can pass error code through
- * exit_info_2.
- */
- if (WARN(ret || ghcb->save.sw_exit_info_2,
- "SNP: PSC failed ret=%d exit_info_2=%llx\n",
- ret, ghcb->save.sw_exit_info_2)) {
- ret = 1;
- goto out;
- }
-
- /* Verify that reserved bit is not set */
- if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
- ret = 1;
- goto out;
- }
-
- /*
- * Sanity check that entry processing is not going backwards.
- * This will happen only if hypervisor is tricking us.
- */
- if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
-"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
- end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
- ret = 1;
- goto out;
- }
- }
-
-out:
- return ret;
-}
-
-static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
- unsigned long exit_code)
-{
- unsigned int opcode = (unsigned int)ctxt->insn.opcode.value;
- u8 modrm = ctxt->insn.modrm.value;
-
- switch (exit_code) {
-
- case SVM_EXIT_IOIO:
- case SVM_EXIT_NPF:
- /* handled separately */
- return ES_OK;
-
- case SVM_EXIT_CPUID:
- if (opcode == 0xa20f)
- return ES_OK;
- break;
-
- case SVM_EXIT_INVD:
- if (opcode == 0x080f)
- return ES_OK;
- break;
-
- case SVM_EXIT_MONITOR:
- /* MONITOR and MONITORX instructions generate the same error code */
- if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa))
- return ES_OK;
- break;
-
- case SVM_EXIT_MWAIT:
- /* MWAIT and MWAITX instructions generate the same error code */
- if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb))
- return ES_OK;
- break;
-
- case SVM_EXIT_MSR:
- /* RDMSR */
- if (opcode == 0x320f ||
- /* WRMSR */
- opcode == 0x300f)
- return ES_OK;
- break;
-
- case SVM_EXIT_RDPMC:
- if (opcode == 0x330f)
- return ES_OK;
- break;
-
- case SVM_EXIT_RDTSC:
- if (opcode == 0x310f)
- return ES_OK;
- break;
-
- case SVM_EXIT_RDTSCP:
- if (opcode == 0x010f && modrm == 0xf9)
- return ES_OK;
- break;
-
- case SVM_EXIT_READ_DR7:
- if (opcode == 0x210f &&
- X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
- return ES_OK;
- break;
-
- case SVM_EXIT_VMMCALL:
- if (opcode == 0x010f && modrm == 0xd9)
- return ES_OK;
-
- break;
-
- case SVM_EXIT_WRITE_DR7:
- if (opcode == 0x230f &&
- X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
- return ES_OK;
- break;
-
- case SVM_EXIT_WBINVD:
- if (opcode == 0x90f)
- return ES_OK;
- break;
-
- default:
- break;
- }
-
- sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n",
- opcode, exit_code, ctxt->regs->ip);
-
- return ES_UNSUPPORTED;
-}
-
/*
* Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM
* services needed when not running in VMPL0.
@@ -1681,7 +839,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info)
* routine is running identity mapped when called, both by the decompressor
* code and the early kernel code.
*/
- if (!rmpadjust((unsigned long)&RIP_REL_REF(boot_ghcb_page), RMP_PG_SIZE_4K, 1))
+ if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1))
return false;
/*
@@ -1698,7 +856,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info)
if (!secrets_page->svsm_guest_vmpl)
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_VMPL0);
- RIP_REL_REF(snp_vmpl) = secrets_page->svsm_guest_vmpl;
+ snp_vmpl = secrets_page->svsm_guest_vmpl;
caa = secrets_page->svsm_caa;
@@ -1713,8 +871,8 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info)
* The CA is identity mapped when this routine is called, both by the
* decompressor code and the early kernel code.
*/
- RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)caa;
- RIP_REL_REF(boot_svsm_caa_pa) = caa;
+ boot_svsm_caa = (struct svsm_ca *)caa;
+ boot_svsm_caa_pa = caa;
/* Advertise the SVSM presence via CPUID. */
cpuid_table = (struct snp_cpuid_table *)snp_cpuid_get_table();
diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c
new file mode 100644
index 000000000000..435853a55768
--- /dev/null
+++ b/arch/x86/boot/startup/sev-startup.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/percpu-defs.h>
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/set_memory.h>
+#include <linux/memblock.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/efi.h>
+#include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <uapi/linux/sev-guest.h>
+
+#include <asm/init.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/stacktrace.h>
+#include <asm/sev.h>
+#include <asm/sev-internal.h>
+#include <asm/insn-eval.h>
+#include <asm/fpu/xcr.h>
+#include <asm/processor.h>
+#include <asm/realmode.h>
+#include <asm/setup.h>
+#include <asm/traps.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid.h>
+#include <asm/cmdline.h>
+
+/* For early boot hypervisor communication in SEV-ES enabled guests */
+struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
+
+/*
+ * Needs to be in the .data section because we need it NULL before bss is
+ * cleared
+ */
+struct ghcb *boot_ghcb __section(".data");
+
+/* Bitmap of SEV features supported by the hypervisor */
+u64 sev_hv_features __ro_after_init;
+
+/* Secrets page physical address from the CC blob */
+u64 sev_secrets_pa __ro_after_init;
+
+/* For early boot SVSM communication */
+struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE);
+
+DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
+DEFINE_PER_CPU(u64, svsm_caa_pa);
+
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (unlikely(data->ghcb_active)) {
+ /* GHCB is already in use - save its contents */
+
+ if (unlikely(data->backup_ghcb_active)) {
+ /*
+ * Backup-GHCB is also already in use. There is no way
+ * to continue here so just kill the machine. To make
+ * panic() work, mark GHCBs inactive so that messages
+ * can be printed out.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ instrumentation_begin();
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ instrumentation_end();
+ }
+
+ /* Mark backup_ghcb active before writing to it */
+ data->backup_ghcb_active = true;
+
+ state->ghcb = &data->backup_ghcb;
+
+ /* Backup GHCB content */
+ *state->ghcb = *ghcb;
+ } else {
+ state->ghcb = NULL;
+ data->ghcb_active = true;
+ }
+
+ return ghcb;
+}
+
+/* Include code shared with pre-decompression boot stage */
+#include "sev-shared.c"
+
+noinstr void __sev_put_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (state->ghcb) {
+ /* Restore GHCB from Backup */
+ *ghcb = *state->ghcb;
+ data->backup_ghcb_active = false;
+ state->ghcb = NULL;
+ } else {
+ /*
+ * Invalidate the GHCB so a VMGEXIT instruction issued
+ * from userspace won't appear to be valid.
+ */
+ vc_ghcb_invalidate(ghcb);
+ data->ghcb_active = false;
+ }
+}
+
+int svsm_perform_call_protocol(struct svsm_call *call)
+{
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret;
+
+ /*
+ * This can be called very early in the boot, use native functions in
+ * order to avoid paravirt issues.
+ */
+ flags = native_local_irq_save();
+
+ if (sev_cfg.ghcbs_initialized)
+ ghcb = __sev_get_ghcb(&state);
+ else if (boot_ghcb)
+ ghcb = boot_ghcb;
+ else
+ ghcb = NULL;
+
+ do {
+ ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
+ : svsm_perform_msr_protocol(call);
+ } while (ret == -EAGAIN);
+
+ if (sev_cfg.ghcbs_initialized)
+ __sev_put_ghcb(&state);
+
+ native_local_irq_restore(flags);
+
+ return ret;
+}
+
+void __head
+early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, enum psc_op op)
+{
+ unsigned long paddr_end;
+ u64 val;
+
+ vaddr = vaddr & PAGE_MASK;
+
+ paddr = paddr & PAGE_MASK;
+ paddr_end = paddr + (npages << PAGE_SHIFT);
+
+ while (paddr < paddr_end) {
+ /* Page validation must be rescinded before changing to shared */
+ if (op == SNP_PAGE_STATE_SHARED)
+ pvalidate_4k_page(vaddr, paddr, false);
+
+ /*
+ * Use the MSR protocol because this function can be called before
+ * the GHCB is established.
+ */
+ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
+ goto e_term;
+
+ if (GHCB_MSR_PSC_RESP_VAL(val))
+ goto e_term;
+
+ /* Page validation must be performed after changing to private */
+ if (op == SNP_PAGE_STATE_PRIVATE)
+ pvalidate_4k_page(vaddr, paddr, true);
+
+ vaddr += PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ }
+
+ return;
+
+e_term:
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+}
+
+void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages)
+{
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /*
+ * Ask the hypervisor to mark the memory pages as private in the RMP
+ * table.
+ */
+ early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
+}
+
+void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages)
+{
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /* Ask hypervisor to mark the memory pages shared in the RMP table. */
+ early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the kernel
+ * in the following ways, depending on how it is booted:
+ *
+ * - when booted via the boot/decompress kernel:
+ * - via boot_params
+ *
+ * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
+ * - via a setup_data entry, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ /* Boot kernel would have passed the CC blob via boot_params. */
+ if (bp->cc_blob_address) {
+ cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
+ goto found_cc_info;
+ }
+
+ /*
+ * If kernel was booted directly, without the use of the
+ * boot/decompression kernel, the CC blob may have been passed via
+ * setup_data instead.
+ */
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ snp_abort();
+
+ return cc_info;
+}
+
+static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
+{
+ struct svsm_call call = {};
+ int ret;
+ u64 pa;
+
+ /*
+ * Record the SVSM Calling Area address (CAA) if the guest is not
+ * running at VMPL0. The CA will be used to communicate with the
+ * SVSM to perform the SVSM services.
+ */
+ if (!svsm_setup_ca(cc_info))
+ return;
+
+ /*
+ * It is very early in the boot and the kernel is running identity
+ * mapped but without having adjusted the pagetables to where the
+ * kernel was loaded (physbase), so the get the CA address using
+ * RIP-relative addressing.
+ */
+ pa = (u64)rip_rel_ptr(&boot_svsm_ca_page);
+
+ /*
+ * Switch over to the boot SVSM CA while the current CA is still
+ * addressable. There is no GHCB at this point so use the MSR protocol.
+ *
+ * SVSM_CORE_REMAP_CA call:
+ * RAX = 0 (Protocol=0, CallID=0)
+ * RCX = New CA GPA
+ */
+ call.caa = svsm_get_caa();
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
+ call.rcx = pa;
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
+
+ boot_svsm_caa = (struct svsm_ca *)pa;
+ boot_svsm_caa_pa = pa;
+}
+
+bool __head snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE)
+ sev_secrets_pa = cc_info->secrets_phys;
+ else
+ return false;
+
+ setup_cpuid_table(cc_info);
+
+ svsm_setup(cc_info);
+
+ /*
+ * The CC blob will be used later to access the secrets page. Cache
+ * it here like the boot kernel does.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
+void __head __noreturn snp_abort(void)
+{
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+}
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/boot/startup/sme.c
index 5eecdd92da10..753cd2094080 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/boot/startup/sme.c
@@ -45,8 +45,6 @@
#include <asm/coco.h>
#include <asm/sev.h>
-#include "mm_internal.h"
-
#define PGD_FLAGS _KERNPG_TABLE_NOENC
#define P4D_FLAGS _KERNPG_TABLE_NOENC
#define PUD_FLAGS _KERNPG_TABLE_NOENC
@@ -299,8 +297,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp)
* instrumentation or checking boot_cpu_data in the cc_platform_has()
* function.
*/
- if (!sme_get_me_mask() ||
- RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED)
+ if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
return;
/*
@@ -318,8 +315,8 @@ void __head sme_encrypt_kernel(struct boot_params *bp)
* memory from being cached.
*/
- kernel_start = (unsigned long)RIP_REL_REF(_text);
- kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE);
+ kernel_start = (unsigned long)rip_rel_ptr(_text);
+ kernel_end = ALIGN((unsigned long)rip_rel_ptr(_end), PMD_SIZE);
kernel_len = kernel_end - kernel_start;
initrd_start = 0;
@@ -345,7 +342,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp)
* pagetable structures for the encryption of the kernel
* pagetable structures for workarea (in case not currently mapped)
*/
- execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea);
+ execute_start = workarea_start = (unsigned long)rip_rel_ptr(sme_workarea);
execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
execute_len = execute_end - execute_start;
@@ -526,7 +523,7 @@ void __head sme_enable(struct boot_params *bp)
me_mask = 1UL << (ebx & 0x3f);
/* Check the SEV MSR whether SEV or SME is enabled */
- RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV);
+ sev_status = msr = __rdmsr(MSR_AMD64_SEV);
feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/*
@@ -562,8 +559,17 @@ void __head sme_enable(struct boot_params *bp)
return;
}
- RIP_REL_REF(sme_me_mask) = me_mask;
- RIP_REL_REF(physical_mask) &= ~me_mask;
- RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD;
+ sme_me_mask = me_mask;
+ physical_mask &= ~me_mask;
+ cc_vendor = CC_VENDOR_AMD;
cc_set_mask(me_mask);
}
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+/* Local version for startup code, which never operates on user page tables */
+__weak
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 84f7a883ce1e..f35369bb14c5 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -32,7 +32,7 @@
int memcmp(const void *s1, const void *s2, size_t len)
{
bool diff;
- asm("repe; cmpsb" CC_SET(nz)
+ asm("repe cmpsb" CC_SET(nz)
: CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index f2e96905b3fe..0641c8c46aee 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -292,7 +292,7 @@ static void restore_screen(void)
"shrw %%cx ; "
"jnc 1f ; "
"stosw \n\t"
- "1: rep;stosl ; "
+ "1: rep stosl ; "
"popw %%es"
: "+D" (dst), "+c" (npad)
: "bdS" (video_segment),
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index 9a0ddda3aa69..d4610af68114 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -18,7 +18,9 @@
#include <asm/processor.h>
enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
+SYM_PIC_ALIAS(cc_vendor);
u64 cc_mask __ro_after_init;
+SYM_PIC_ALIAS(cc_mask);
static struct cc_attr_flags {
__u64 host_sev_snp : 1,
diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile
index dcb06dc8b5ae..db3255b979bd 100644
--- a/arch/x86/coco/sev/Makefile
+++ b/arch/x86/coco/sev/Makefile
@@ -1,22 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y += core.o
-
-# jump tables are emitted using absolute references in non-PIC code
-# so they cannot be used in the early SEV startup code
-CFLAGS_core.o += -fno-jump-tables
-
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = -pg
-endif
-
-KASAN_SANITIZE_core.o := n
-KMSAN_SANITIZE_core.o := n
-KCOV_INSTRUMENT_core.o := n
-
-# With some compiler versions the generated code results in boot hangs, caused
-# by several compilation units. To be safe, disable all instrumentation.
-KCSAN_SANITIZE := n
+obj-y += core.o sev-nmi.o vc-handle.o
# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining
-UBSAN_SANITIZE := n
+UBSAN_SANITIZE_sev-nmi.o := n
+
+# GCC may fail to respect __no_sanitize_address when inlining
+KASAN_SANITIZE_sev-nmi.o := n
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index b0c1a7a57497..ac400525de73 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -31,6 +31,7 @@
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/sev.h>
+#include <asm/sev-internal.h>
#include <asm/insn-eval.h>
#include <asm/fpu/xcr.h>
#include <asm/processor.h>
@@ -44,8 +45,6 @@
#include <asm/cpuid.h>
#include <asm/cmdline.h>
-#define DR7_RESET_VALUE 0x400
-
/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
#define AP_INIT_CS_LIMIT 0xffff
#define AP_INIT_DS_LIMIT 0xffff
@@ -81,21 +80,6 @@ static const char * const sev_status_feat_names[] = {
[MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt",
};
-/* For early boot hypervisor communication in SEV-ES enabled guests */
-static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
-
-/*
- * Needs to be in the .data section because we need it NULL before bss is
- * cleared
- */
-static struct ghcb *boot_ghcb __section(".data");
-
-/* Bitmap of SEV features supported by the hypervisor */
-static u64 sev_hv_features __ro_after_init;
-
-/* Secrets page physical address from the CC blob */
-static u64 secrets_pa __ro_after_init;
-
/*
* For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and
* initializes snp_tsc_scale and snp_tsc_offset. These values are replicated
@@ -105,558 +89,196 @@ static u64 snp_tsc_scale __ro_after_init;
static u64 snp_tsc_offset __ro_after_init;
static u64 snp_tsc_freq_khz __ro_after_init;
-/* #VC handler runtime per-CPU data */
-struct sev_es_runtime_data {
- struct ghcb ghcb_page;
-
- /*
- * Reserve one page per CPU as backup storage for the unencrypted GHCB.
- * It is needed when an NMI happens while the #VC handler uses the real
- * GHCB, and the NMI handler itself is causing another #VC exception. In
- * that case the GHCB content of the first handler needs to be backed up
- * and restored.
- */
- struct ghcb backup_ghcb;
-
- /*
- * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
- * There is no need for it to be atomic, because nothing is written to
- * the GHCB between the read and the write of ghcb_active. So it is safe
- * to use it when a nested #VC exception happens before the write.
- *
- * This is necessary for example in the #VC->NMI->#VC case when the NMI
- * happens while the first #VC handler uses the GHCB. When the NMI code
- * raises a second #VC handler it might overwrite the contents of the
- * GHCB written by the first handler. To avoid this the content of the
- * GHCB is saved and restored when the GHCB is detected to be in use
- * already.
- */
- bool ghcb_active;
- bool backup_ghcb_active;
-
- /*
- * Cached DR7 value - write it on DR7 writes and return it on reads.
- * That value will never make it to the real hardware DR7 as debugging
- * is currently unsupported in SEV-ES guests.
- */
- unsigned long dr7;
-};
-
-struct ghcb_state {
- struct ghcb *ghcb;
-};
-
-/* For early boot SVSM communication */
-static struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE);
-
-static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
-static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
-static DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
-static DEFINE_PER_CPU(u64, svsm_caa_pa);
-
-static __always_inline bool on_vc_stack(struct pt_regs *regs)
-{
- unsigned long sp = regs->sp;
-
- /* User-mode RSP is not trusted */
- if (user_mode(regs))
- return false;
-
- /* SYSCALL gap still has user-mode RSP */
- if (ip_within_syscall_gap(regs))
- return false;
-
- return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
-}
+DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
+DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
/*
- * This function handles the case when an NMI is raised in the #VC
- * exception handler entry code, before the #VC handler has switched off
- * its IST stack. In this case, the IST entry for #VC must be adjusted,
- * so that any nested #VC exception will not overwrite the stack
- * contents of the interrupted #VC handler.
- *
- * The IST entry is adjusted unconditionally so that it can be also be
- * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
- * nested sev_es_ist_exit() call may adjust back the IST entry too
- * early.
- *
- * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
- * on the NMI IST stack, as they are only called from NMI handling code
- * right now.
+ * SVSM related information:
+ * When running under an SVSM, the VMPL that Linux is executing at must be
+ * non-zero. The VMPL is therefore used to indicate the presence of an SVSM.
*/
-void noinstr __sev_es_ist_enter(struct pt_regs *regs)
-{
- unsigned long old_ist, new_ist;
-
- /* Read old IST entry */
- new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
-
- /*
- * If NMI happened while on the #VC IST stack, set the new IST
- * value below regs->sp, so that the interrupted stack frame is
- * not overwritten by subsequent #VC exceptions.
- */
- if (on_vc_stack(regs))
- new_ist = regs->sp;
-
- /*
- * Reserve additional 8 bytes and store old IST value so this
- * adjustment can be unrolled in __sev_es_ist_exit().
- */
- new_ist -= sizeof(old_ist);
- *(unsigned long *)new_ist = old_ist;
-
- /* Set new IST entry */
- this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
-}
+u8 snp_vmpl __ro_after_init;
+EXPORT_SYMBOL_GPL(snp_vmpl);
-void noinstr __sev_es_ist_exit(void)
+static u64 __init get_snp_jump_table_addr(void)
{
- unsigned long ist;
+ struct snp_secrets_page *secrets;
+ void __iomem *mem;
+ u64 addr;
- /* Read IST entry */
- ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+ mem = ioremap_encrypted(sev_secrets_pa, PAGE_SIZE);
+ if (!mem) {
+ pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
+ return 0;
+ }
- if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
- return;
+ secrets = (__force struct snp_secrets_page *)mem;
- /* Read back old IST entry and write it to the TSS */
- this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
+ addr = secrets->os_area.ap_jump_table_pa;
+ iounmap(mem);
+
+ return addr;
}
-/*
- * Nothing shall interrupt this code path while holding the per-CPU
- * GHCB. The backup GHCB is only for NMIs interrupting this path.
- *
- * Callers must disable local interrupts around it.
- */
-static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
+static u64 __init get_jump_table_addr(void)
{
- struct sev_es_runtime_data *data;
+ struct ghcb_state state;
+ unsigned long flags;
struct ghcb *ghcb;
+ u64 ret = 0;
- WARN_ON(!irqs_disabled());
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (unlikely(data->ghcb_active)) {
- /* GHCB is already in use - save its contents */
-
- if (unlikely(data->backup_ghcb_active)) {
- /*
- * Backup-GHCB is also already in use. There is no way
- * to continue here so just kill the machine. To make
- * panic() work, mark GHCBs inactive so that messages
- * can be printed out.
- */
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-
- instrumentation_begin();
- panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
- instrumentation_end();
- }
-
- /* Mark backup_ghcb active before writing to it */
- data->backup_ghcb_active = true;
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return get_snp_jump_table_addr();
- state->ghcb = &data->backup_ghcb;
+ local_irq_save(flags);
- /* Backup GHCB content */
- *state->ghcb = *ghcb;
- } else {
- state->ghcb = NULL;
- data->ghcb_active = true;
- }
+ ghcb = __sev_get_ghcb(&state);
- return ghcb;
-}
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
+ ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
-static inline u64 sev_es_rd_ghcb_msr(void)
-{
- return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
-}
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
-static __always_inline void sev_es_wr_ghcb_msr(u64 val)
-{
- u32 low, high;
+ if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
+ ghcb_sw_exit_info_2_is_valid(ghcb))
+ ret = ghcb->save.sw_exit_info_2;
- low = (u32)(val);
- high = (u32)(val >> 32);
+ __sev_put_ghcb(&state);
- native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
-}
+ local_irq_restore(flags);
-static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
- unsigned char *buffer)
-{
- return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+ return ret;
}
-static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
+static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size,
+ int ret, u64 svsm_ret)
{
- char buffer[MAX_INSN_SIZE];
- int insn_bytes;
-
- insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
- if (insn_bytes == 0) {
- /* Nothing could be copied */
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
- ctxt->fi.cr2 = ctxt->regs->ip;
- return ES_EXCEPTION;
- } else if (insn_bytes == -EINVAL) {
- /* Effective RIP could not be calculated */
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
- ctxt->fi.cr2 = 0;
- return ES_EXCEPTION;
- }
-
- if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
- return ES_DECODE_FAILED;
+ WARN(1, "PVALIDATE failure: pfn: 0x%llx, action: %u, size: %u, ret: %d, svsm_ret: 0x%llx\n",
+ pfn, action, page_size, ret, svsm_ret);
- if (ctxt->insn.immediate.got)
- return ES_OK;
- else
- return ES_DECODE_FAILED;
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
}
-static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
+static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svsm_ret)
{
- char buffer[MAX_INSN_SIZE];
- int res, ret;
+ unsigned int page_size;
+ bool action;
+ u64 pfn;
- res = vc_fetch_insn_kernel(ctxt, buffer);
- if (res) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_INSTR;
- ctxt->fi.cr2 = ctxt->regs->ip;
- return ES_EXCEPTION;
- }
+ pfn = pc->entry[pc->cur_index].pfn;
+ action = pc->entry[pc->cur_index].action;
+ page_size = pc->entry[pc->cur_index].page_size;
- ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
- if (ret < 0)
- return ES_DECODE_FAILED;
- else
- return ES_OK;
+ __pval_terminate(pfn, action, page_size, ret, svsm_ret);
}
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+static void pval_pages(struct snp_psc_desc *desc)
{
- if (user_mode(ctxt->regs))
- return __vc_decode_user_insn(ctxt);
- else
- return __vc_decode_kern_insn(ctxt);
-}
+ struct psc_entry *e;
+ unsigned long vaddr;
+ unsigned int size;
+ unsigned int i;
+ bool validate;
+ u64 pfn;
+ int rc;
-static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
- char *dst, char *buf, size_t size)
-{
- unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
+ for (i = 0; i <= desc->hdr.end_entry; i++) {
+ e = &desc->entries[i];
- /*
- * This function uses __put_user() independent of whether kernel or user
- * memory is accessed. This works fine because __put_user() does no
- * sanity checks of the pointer being accessed. All that it does is
- * to report when the access failed.
- *
- * Also, this function runs in atomic context, so __put_user() is not
- * allowed to sleep. The page-fault handler detects that it is running
- * in atomic context and will not try to take mmap_sem and handle the
- * fault, so additional pagefault_enable()/disable() calls are not
- * needed.
- *
- * The access can't be done via copy_to_user() here because
- * vc_write_mem() must not use string instructions to access unsafe
- * memory. The reason is that MOVS is emulated by the #VC handler by
- * splitting the move up into a read and a write and taking a nested #VC
- * exception on whatever of them is the MMIO access. Using string
- * instructions here would cause infinite nesting.
- */
- switch (size) {
- case 1: {
- u8 d1;
- u8 __user *target = (u8 __user *)dst;
-
- memcpy(&d1, buf, 1);
- if (__put_user(d1, target))
- goto fault;
- break;
- }
- case 2: {
- u16 d2;
- u16 __user *target = (u16 __user *)dst;
+ pfn = e->gfn;
+ vaddr = (unsigned long)pfn_to_kaddr(pfn);
+ size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
+ validate = e->operation == SNP_PAGE_STATE_PRIVATE;
- memcpy(&d2, buf, 2);
- if (__put_user(d2, target))
- goto fault;
- break;
- }
- case 4: {
- u32 d4;
- u32 __user *target = (u32 __user *)dst;
+ rc = pvalidate(vaddr, size, validate);
+ if (!rc)
+ continue;
- memcpy(&d4, buf, 4);
- if (__put_user(d4, target))
- goto fault;
- break;
- }
- case 8: {
- u64 d8;
- u64 __user *target = (u64 __user *)dst;
+ if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) {
+ unsigned long vaddr_end = vaddr + PMD_SIZE;
- memcpy(&d8, buf, 8);
- if (__put_user(d8, target))
- goto fault;
- break;
- }
- default:
- WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
- return ES_UNSUPPORTED;
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE, pfn++) {
+ rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
+ if (rc)
+ __pval_terminate(pfn, validate, RMP_PG_SIZE_4K, rc, 0);
+ }
+ } else {
+ __pval_terminate(pfn, validate, size, rc, 0);
+ }
}
-
- return ES_OK;
-
-fault:
- if (user_mode(ctxt->regs))
- error_code |= X86_PF_USER;
-
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = error_code;
- ctxt->fi.cr2 = (unsigned long)dst;
-
- return ES_EXCEPTION;
}
-static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
- char *src, char *buf, size_t size)
+static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action,
+ struct svsm_pvalidate_call *pc)
{
- unsigned long error_code = X86_PF_PROT;
-
- /*
- * This function uses __get_user() independent of whether kernel or user
- * memory is accessed. This works fine because __get_user() does no
- * sanity checks of the pointer being accessed. All that it does is
- * to report when the access failed.
- *
- * Also, this function runs in atomic context, so __get_user() is not
- * allowed to sleep. The page-fault handler detects that it is running
- * in atomic context and will not try to take mmap_sem and handle the
- * fault, so additional pagefault_enable()/disable() calls are not
- * needed.
- *
- * The access can't be done via copy_from_user() here because
- * vc_read_mem() must not use string instructions to access unsafe
- * memory. The reason is that MOVS is emulated by the #VC handler by
- * splitting the move up into a read and a write and taking a nested #VC
- * exception on whatever of them is the MMIO access. Using string
- * instructions here would cause infinite nesting.
- */
- switch (size) {
- case 1: {
- u8 d1;
- u8 __user *s = (u8 __user *)src;
-
- if (__get_user(d1, s))
- goto fault;
- memcpy(buf, &d1, 1);
- break;
- }
- case 2: {
- u16 d2;
- u16 __user *s = (u16 __user *)src;
-
- if (__get_user(d2, s))
- goto fault;
- memcpy(buf, &d2, 2);
- break;
- }
- case 4: {
- u32 d4;
- u32 __user *s = (u32 __user *)src;
-
- if (__get_user(d4, s))
- goto fault;
- memcpy(buf, &d4, 4);
- break;
- }
- case 8: {
- u64 d8;
- u64 __user *s = (u64 __user *)src;
- if (__get_user(d8, s))
- goto fault;
- memcpy(buf, &d8, 8);
- break;
- }
- default:
- WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
- return ES_UNSUPPORTED;
- }
-
- return ES_OK;
-
-fault:
- if (user_mode(ctxt->regs))
- error_code |= X86_PF_USER;
+ struct svsm_pvalidate_entry *pe;
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = error_code;
- ctxt->fi.cr2 = (unsigned long)src;
+ /* Nothing in the CA yet */
+ pc->num_entries = 0;
+ pc->cur_index = 0;
- return ES_EXCEPTION;
-}
+ pe = &pc->entry[0];
-static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned long vaddr, phys_addr_t *paddr)
-{
- unsigned long va = (unsigned long)vaddr;
- unsigned int level;
- phys_addr_t pa;
- pgd_t *pgd;
- pte_t *pte;
+ while (pfn < pfn_end) {
+ pe->page_size = RMP_PG_SIZE_4K;
+ pe->action = action;
+ pe->ignore_cf = 0;
+ pe->pfn = pfn;
- pgd = __va(read_cr3_pa());
- pgd = &pgd[pgd_index(va)];
- pte = lookup_address_in_pgd(pgd, va, &level);
- if (!pte) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.cr2 = vaddr;
- ctxt->fi.error_code = 0;
+ pe++;
+ pfn++;
- if (user_mode(ctxt->regs))
- ctxt->fi.error_code |= X86_PF_USER;
-
- return ES_EXCEPTION;
+ pc->num_entries++;
+ if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT)
+ break;
}
- if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
- /* Emulated MMIO to/from encrypted memory not supported */
- return ES_UNSUPPORTED;
-
- pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
- pa |= va & ~page_level_mask(level);
-
- *paddr = pa;
-
- return ES_OK;
+ return pfn;
}
-static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int desc_entry,
+ struct svsm_pvalidate_call *pc)
{
- BUG_ON(size > 4);
-
- if (user_mode(ctxt->regs)) {
- struct thread_struct *t = &current->thread;
- struct io_bitmap *iobm = t->io_bitmap;
- size_t idx;
-
- if (!iobm)
- goto fault;
-
- for (idx = port; idx < port + size; ++idx) {
- if (test_bit(idx, iobm->bitmap))
- goto fault;
- }
- }
-
- return ES_OK;
+ struct svsm_pvalidate_entry *pe;
+ struct psc_entry *e;
-fault:
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
+ /* Nothing in the CA yet */
+ pc->num_entries = 0;
+ pc->cur_index = 0;
- return ES_EXCEPTION;
-}
+ pe = &pc->entry[0];
+ e = &desc->entries[desc_entry];
-static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
-{
- long error_code = ctxt->fi.error_code;
- int trapnr = ctxt->fi.vector;
+ while (desc_entry <= desc->hdr.end_entry) {
+ pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
+ pe->action = e->operation == SNP_PAGE_STATE_PRIVATE;
+ pe->ignore_cf = 0;
+ pe->pfn = e->gfn;
- ctxt->regs->orig_ax = ctxt->fi.error_code;
+ pe++;
+ e++;
- switch (trapnr) {
- case X86_TRAP_GP:
- exc_general_protection(ctxt->regs, error_code);
- break;
- case X86_TRAP_UD:
- exc_invalid_op(ctxt->regs);
- break;
- case X86_TRAP_PF:
- write_cr2(ctxt->fi.cr2);
- exc_page_fault(ctxt->regs, error_code);
- break;
- case X86_TRAP_AC:
- exc_alignment_check(ctxt->regs, error_code);
- break;
- default:
- pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
- BUG();
+ desc_entry++;
+ pc->num_entries++;
+ if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT)
+ break;
}
-}
-
-/* Include code shared with pre-decompression boot stage */
-#include "shared.c"
-
-static inline struct svsm_ca *svsm_get_caa(void)
-{
- /*
- * Use rIP-relative references when called early in the boot. If
- * ->use_cas is set, then it is late in the boot and no need
- * to worry about rIP-relative references.
- */
- if (RIP_REL_REF(sev_cfg).use_cas)
- return this_cpu_read(svsm_caa);
- else
- return RIP_REL_REF(boot_svsm_caa);
-}
-
-static u64 svsm_get_caa_pa(void)
-{
- /*
- * Use rIP-relative references when called early in the boot. If
- * ->use_cas is set, then it is late in the boot and no need
- * to worry about rIP-relative references.
- */
- if (RIP_REL_REF(sev_cfg).use_cas)
- return this_cpu_read(svsm_caa_pa);
- else
- return RIP_REL_REF(boot_svsm_caa_pa);
-}
-
-static noinstr void __sev_put_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- WARN_ON(!irqs_disabled());
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
- if (state->ghcb) {
- /* Restore GHCB from Backup */
- *ghcb = *state->ghcb;
- data->backup_ghcb_active = false;
- state->ghcb = NULL;
- } else {
- /*
- * Invalidate the GHCB so a VMGEXIT instruction issued
- * from userspace won't appear to be valid.
- */
- vc_ghcb_invalidate(ghcb);
- data->ghcb_active = false;
- }
+ return desc_entry;
}
-static int svsm_perform_call_protocol(struct svsm_call *call)
+static void svsm_pval_pages(struct snp_psc_desc *desc)
{
- struct ghcb_state state;
+ struct svsm_pvalidate_entry pv_4k[VMGEXIT_PSC_MAX_ENTRY];
+ unsigned int i, pv_4k_count = 0;
+ struct svsm_pvalidate_call *pc;
+ struct svsm_call call = {};
unsigned long flags;
- struct ghcb *ghcb;
+ bool action;
+ u64 pc_pa;
int ret;
/*
@@ -666,180 +288,145 @@ static int svsm_perform_call_protocol(struct svsm_call *call)
flags = native_local_irq_save();
/*
- * Use rip-relative references when called early in the boot. If
- * ghcbs_initialized is set, then it is late in the boot and no need
- * to worry about rip-relative references in called functions.
+ * The SVSM calling area (CA) can support processing 510 entries at a
+ * time. Loop through the Page State Change descriptor until the CA is
+ * full or the last entry in the descriptor is reached, at which time
+ * the SVSM is invoked. This repeats until all entries in the descriptor
+ * are processed.
*/
- if (RIP_REL_REF(sev_cfg).ghcbs_initialized)
- ghcb = __sev_get_ghcb(&state);
- else if (RIP_REL_REF(boot_ghcb))
- ghcb = RIP_REL_REF(boot_ghcb);
- else
- ghcb = NULL;
+ call.caa = svsm_get_caa();
- do {
- ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
- : svsm_perform_msr_protocol(call);
- } while (ret == -EAGAIN);
+ pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer;
+ pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
- if (RIP_REL_REF(sev_cfg).ghcbs_initialized)
- __sev_put_ghcb(&state);
+ /* Protocol 0, Call ID 1 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
+ call.rcx = pc_pa;
- native_local_irq_restore(flags);
+ for (i = 0; i <= desc->hdr.end_entry;) {
+ i = svsm_build_ca_from_psc_desc(desc, i, pc);
- return ret;
-}
+ do {
+ ret = svsm_perform_call_protocol(&call);
+ if (!ret)
+ continue;
-void noinstr __sev_es_nmi_complete(void)
-{
- struct ghcb_state state;
- struct ghcb *ghcb;
+ /*
+ * Check if the entry failed because of an RMP mismatch (a
+ * PVALIDATE at 2M was requested, but the page is mapped in
+ * the RMP as 4K).
+ */
- ghcb = __sev_get_ghcb(&state);
+ if (call.rax_out == SVSM_PVALIDATE_FAIL_SIZEMISMATCH &&
+ pc->entry[pc->cur_index].page_size == RMP_PG_SIZE_2M) {
+ /* Save this entry for post-processing at 4K */
+ pv_4k[pv_4k_count++] = pc->entry[pc->cur_index];
+
+ /* Skip to the next one unless at the end of the list */
+ pc->cur_index++;
+ if (pc->cur_index < pc->num_entries)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ }
+ } while (ret == -EAGAIN);
- vc_ghcb_invalidate(ghcb);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
- ghcb_set_sw_exit_info_1(ghcb, 0);
- ghcb_set_sw_exit_info_2(ghcb, 0);
+ if (ret)
+ svsm_pval_terminate(pc, ret, call.rax_out);
+ }
- sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
- VMGEXIT();
+ /* Process any entries that failed to be validated at 2M and validate them at 4K */
+ for (i = 0; i < pv_4k_count; i++) {
+ u64 pfn, pfn_end;
- __sev_put_ghcb(&state);
-}
+ action = pv_4k[i].action;
+ pfn = pv_4k[i].pfn;
+ pfn_end = pfn + 512;
-static u64 __init get_snp_jump_table_addr(void)
-{
- struct snp_secrets_page *secrets;
- void __iomem *mem;
- u64 addr;
+ while (pfn < pfn_end) {
+ pfn = svsm_build_ca_from_pfn_range(pfn, pfn_end, action, pc);
- mem = ioremap_encrypted(secrets_pa, PAGE_SIZE);
- if (!mem) {
- pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
- return 0;
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ svsm_pval_terminate(pc, ret, call.rax_out);
+ }
}
- secrets = (__force struct snp_secrets_page *)mem;
-
- addr = secrets->os_area.ap_jump_table_pa;
- iounmap(mem);
-
- return addr;
+ native_local_irq_restore(flags);
}
-static u64 __init get_jump_table_addr(void)
+static void pvalidate_pages(struct snp_psc_desc *desc)
{
- struct ghcb_state state;
- unsigned long flags;
- struct ghcb *ghcb;
- u64 ret = 0;
-
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return get_snp_jump_table_addr();
-
- local_irq_save(flags);
+ if (snp_vmpl)
+ svsm_pval_pages(desc);
+ else
+ pval_pages(desc);
+}
- ghcb = __sev_get_ghcb(&state);
+static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
+{
+ int cur_entry, end_entry, ret = 0;
+ struct snp_psc_desc *data;
+ struct es_em_ctxt ctxt;
vc_ghcb_invalidate(ghcb);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
- ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
- ghcb_set_sw_exit_info_2(ghcb, 0);
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
- ghcb_sw_exit_info_2_is_valid(ghcb))
- ret = ghcb->save.sw_exit_info_2;
-
- __sev_put_ghcb(&state);
-
- local_irq_restore(flags);
+ /* Copy the input desc into GHCB shared buffer */
+ data = (struct snp_psc_desc *)ghcb->shared_buffer;
+ memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
- return ret;
-}
-
-static void __head
-early_set_pages_state(unsigned long vaddr, unsigned long paddr,
- unsigned long npages, enum psc_op op)
-{
- unsigned long paddr_end;
- u64 val;
-
- vaddr = vaddr & PAGE_MASK;
+ /*
+ * As per the GHCB specification, the hypervisor can resume the guest
+ * before processing all the entries. Check whether all the entries
+ * are processed. If not, then keep retrying. Note, the hypervisor
+ * will update the data memory directly to indicate the status, so
+ * reference the data->hdr everywhere.
+ *
+ * The strategy here is to wait for the hypervisor to change the page
+ * state in the RMP table before guest accesses the memory pages. If the
+ * page state change was not successful, then later memory access will
+ * result in a crash.
+ */
+ cur_entry = data->hdr.cur_entry;
+ end_entry = data->hdr.end_entry;
- paddr = paddr & PAGE_MASK;
- paddr_end = paddr + (npages << PAGE_SHIFT);
+ while (data->hdr.cur_entry <= data->hdr.end_entry) {
+ ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
- while (paddr < paddr_end) {
- /* Page validation must be rescinded before changing to shared */
- if (op == SNP_PAGE_STATE_SHARED)
- pvalidate_4k_page(vaddr, paddr, false);
+ /* This will advance the shared buffer data points to. */
+ ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
/*
- * Use the MSR protocol because this function can be called before
- * the GHCB is established.
+ * Page State Change VMGEXIT can pass error code through
+ * exit_info_2.
*/
- sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
- VMGEXIT();
-
- val = sev_es_rd_ghcb_msr();
-
- if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
- goto e_term;
-
- if (GHCB_MSR_PSC_RESP_VAL(val))
- goto e_term;
+ if (WARN(ret || ghcb->save.sw_exit_info_2,
+ "SNP: PSC failed ret=%d exit_info_2=%llx\n",
+ ret, ghcb->save.sw_exit_info_2)) {
+ ret = 1;
+ goto out;
+ }
- /* Page validation must be performed after changing to private */
- if (op == SNP_PAGE_STATE_PRIVATE)
- pvalidate_4k_page(vaddr, paddr, true);
+ /* Verify that reserved bit is not set */
+ if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
+ ret = 1;
+ goto out;
+ }
- vaddr += PAGE_SIZE;
- paddr += PAGE_SIZE;
+ /*
+ * Sanity check that entry processing is not going backwards.
+ * This will happen only if hypervisor is tricking us.
+ */
+ if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
+"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
+ end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
+ ret = 1;
+ goto out;
+ }
}
- return;
-
-e_term:
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
-}
-
-void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
- unsigned long npages)
-{
- /*
- * This can be invoked in early boot while running identity mapped, so
- * use an open coded check for SNP instead of using cc_platform_has().
- * This eliminates worries about jump tables or checking boot_cpu_data
- * in the cc_platform_has() function.
- */
- if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
- return;
-
- /*
- * Ask the hypervisor to mark the memory pages as private in the RMP
- * table.
- */
- early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
-}
-
-void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
- unsigned long npages)
-{
- /*
- * This can be invoked in early boot while running identity mapped, so
- * use an open coded check for SNP instead of using cc_platform_has().
- * This eliminates worries about jump tables or checking boot_cpu_data
- * in the cc_platform_has() function.
- */
- if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
- return;
-
- /* Ask hypervisor to mark the memory pages shared in the RMP table. */
- early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
+out:
+ return ret;
}
static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
@@ -1417,90 +1004,6 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
return 0;
}
-/* Writes to the SVSM CAA MSR are ignored */
-static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write)
-{
- if (write)
- return ES_OK;
-
- regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa));
- regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa));
-
- return ES_OK;
-}
-
-/*
- * TSC related accesses should not exit to the hypervisor when a guest is
- * executing with Secure TSC enabled, so special handling is required for
- * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ.
- */
-static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write)
-{
- u64 tsc;
-
- /*
- * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled.
- * Terminate the SNP guest when the interception is enabled.
- */
- if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
- return ES_VMM_ERROR;
-
- /*
- * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC
- * to return undefined values, so ignore all writes.
- *
- * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use
- * the value returned by rdtsc_ordered().
- */
- if (write) {
- WARN_ONCE(1, "TSC MSR writes are verboten!\n");
- return ES_OK;
- }
-
- tsc = rdtsc_ordered();
- regs->ax = lower_32_bits(tsc);
- regs->dx = upper_32_bits(tsc);
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- enum es_result ret;
- bool write;
-
- /* Is it a WRMSR? */
- write = ctxt->insn.opcode.bytes[1] == 0x30;
-
- switch (regs->cx) {
- case MSR_SVSM_CAA:
- return __vc_handle_msr_caa(regs, write);
- case MSR_IA32_TSC:
- case MSR_AMD64_GUEST_TSC_FREQ:
- if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
- return __vc_handle_secure_tsc_msrs(regs, write);
- break;
- default:
- break;
- }
-
- ghcb_set_rcx(ghcb, regs->cx);
- if (write) {
- ghcb_set_rax(ghcb, regs->ax);
- ghcb_set_rdx(ghcb, regs->dx);
- }
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0);
-
- if ((ret == ES_OK) && !write) {
- regs->ax = ghcb->save.rax;
- regs->dx = ghcb->save.rdx;
- }
-
- return ret;
-}
-
static void snp_register_per_cpu_ghcb(void)
{
struct sev_es_runtime_data *data;
@@ -1713,748 +1216,6 @@ void __init sev_es_init_vc_handling(void)
initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
}
-static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
-{
- int trapnr = ctxt->fi.vector;
-
- if (trapnr == X86_TRAP_PF)
- native_write_cr2(ctxt->fi.cr2);
-
- ctxt->regs->orig_ax = ctxt->fi.error_code;
- do_early_exception(ctxt->regs, trapnr);
-}
-
-static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
-{
- long *reg_array;
- int offset;
-
- reg_array = (long *)ctxt->regs;
- offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
-
- if (offset < 0)
- return NULL;
-
- offset /= sizeof(long);
-
- return reg_array + offset;
-}
-static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned int bytes, bool read)
-{
- u64 exit_code, exit_info_1, exit_info_2;
- unsigned long ghcb_pa = __pa(ghcb);
- enum es_result res;
- phys_addr_t paddr;
- void __user *ref;
-
- ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
- if (ref == (void __user *)-1L)
- return ES_UNSUPPORTED;
-
- exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
-
- res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
- if (res != ES_OK) {
- if (res == ES_EXCEPTION && !read)
- ctxt->fi.error_code |= X86_PF_WRITE;
-
- return res;
- }
-
- exit_info_1 = paddr;
- /* Can never be greater than 8 */
- exit_info_2 = bytes;
-
- ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
-
- return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
-}
-
-/*
- * The MOVS instruction has two memory operands, which raises the
- * problem that it is not known whether the access to the source or the
- * destination caused the #VC exception (and hence whether an MMIO read
- * or write operation needs to be emulated).
- *
- * Instead of playing games with walking page-tables and trying to guess
- * whether the source or destination is an MMIO range, split the move
- * into two operations, a read and a write with only one memory operand.
- * This will cause a nested #VC exception on the MMIO address which can
- * then be handled.
- *
- * This implementation has the benefit that it also supports MOVS where
- * source _and_ destination are MMIO regions.
- *
- * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
- * rare operation. If it turns out to be a performance problem the split
- * operations can be moved to memcpy_fromio() and memcpy_toio().
- */
-static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
- unsigned int bytes)
-{
- unsigned long ds_base, es_base;
- unsigned char *src, *dst;
- unsigned char buffer[8];
- enum es_result ret;
- bool rep;
- int off;
-
- ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
- es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
-
- if (ds_base == -1L || es_base == -1L) {
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
- }
-
- src = ds_base + (unsigned char *)ctxt->regs->si;
- dst = es_base + (unsigned char *)ctxt->regs->di;
-
- ret = vc_read_mem(ctxt, src, buffer, bytes);
- if (ret != ES_OK)
- return ret;
-
- ret = vc_write_mem(ctxt, dst, buffer, bytes);
- if (ret != ES_OK)
- return ret;
-
- if (ctxt->regs->flags & X86_EFLAGS_DF)
- off = -bytes;
- else
- off = bytes;
-
- ctxt->regs->si += off;
- ctxt->regs->di += off;
-
- rep = insn_has_rep_prefix(&ctxt->insn);
- if (rep)
- ctxt->regs->cx -= 1;
-
- if (!rep || ctxt->regs->cx == 0)
- return ES_OK;
- else
- return ES_RETRY;
-}
-
-static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct insn *insn = &ctxt->insn;
- enum insn_mmio_type mmio;
- unsigned int bytes = 0;
- enum es_result ret;
- u8 sign_byte;
- long *reg_data;
-
- mmio = insn_decode_mmio(insn, &bytes);
- if (mmio == INSN_MMIO_DECODE_FAILED)
- return ES_DECODE_FAILED;
-
- if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
- reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
- if (!reg_data)
- return ES_DECODE_FAILED;
- }
-
- if (user_mode(ctxt->regs))
- return ES_UNSUPPORTED;
-
- switch (mmio) {
- case INSN_MMIO_WRITE:
- memcpy(ghcb->shared_buffer, reg_data, bytes);
- ret = vc_do_mmio(ghcb, ctxt, bytes, false);
- break;
- case INSN_MMIO_WRITE_IMM:
- memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
- ret = vc_do_mmio(ghcb, ctxt, bytes, false);
- break;
- case INSN_MMIO_READ:
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Zero-extend for 32-bit operation */
- if (bytes == 4)
- *reg_data = 0;
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
- case INSN_MMIO_READ_ZERO_EXTEND:
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Zero extend based on operand size */
- memset(reg_data, 0, insn->opnd_bytes);
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
- case INSN_MMIO_READ_SIGN_EXTEND:
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- if (bytes == 1) {
- u8 *val = (u8 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x80) ? 0xff : 0x00;
- } else {
- u16 *val = (u16 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x8000) ? 0xff : 0x00;
- }
-
- /* Sign extend based on operand size */
- memset(reg_data, sign_byte, insn->opnd_bytes);
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
- case INSN_MMIO_MOVS:
- ret = vc_handle_mmio_movs(ctxt, bytes);
- break;
- default:
- ret = ES_UNSUPPORTED;
- break;
- }
-
- return ret;
-}
-
-static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
- long val, *reg = vc_insn_get_rm(ctxt);
- enum es_result ret;
-
- if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
- return ES_VMM_ERROR;
-
- if (!reg)
- return ES_DECODE_FAILED;
-
- val = *reg;
-
- /* Upper 32 bits must be written as zeroes */
- if (val >> 32) {
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
- }
-
- /* Clear out other reserved bits and set bit 10 */
- val = (val & 0xffff23ffL) | BIT(10);
-
- /* Early non-zero writes to DR7 are not supported */
- if (!data && (val & ~DR7_RESET_VALUE))
- return ES_UNSUPPORTED;
-
- /* Using a value of 0 for ExitInfo1 means RAX holds the value */
- ghcb_set_rax(ghcb, val);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (data)
- data->dr7 = val;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
- long *reg = vc_insn_get_rm(ctxt);
-
- if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
- return ES_VMM_ERROR;
-
- if (!reg)
- return ES_DECODE_FAILED;
-
- if (data)
- *reg = data->dr7;
- else
- *reg = DR7_RESET_VALUE;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
-}
-
-static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- enum es_result ret;
-
- ghcb_set_rcx(ghcb, ctxt->regs->cx);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
- ctxt->regs->dx = ghcb->save.rdx;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_monitor(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /*
- * Treat it as a NOP and do not leak a physical address to the
- * hypervisor.
- */
- return ES_OK;
-}
-
-static enum es_result vc_handle_mwait(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /* Treat the same as MONITOR/MONITORX */
- return ES_OK;
-}
-
-static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- enum es_result ret;
-
- ghcb_set_rax(ghcb, ctxt->regs->ax);
- ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
-
- if (x86_platform.hyper.sev_es_hcall_prepare)
- x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!ghcb_rax_is_valid(ghcb))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
-
- /*
- * Call sev_es_hcall_finish() after regs->ax is already set.
- * This allows the hypervisor handler to overwrite it again if
- * necessary.
- */
- if (x86_platform.hyper.sev_es_hcall_finish &&
- !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
- return ES_VMM_ERROR;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /*
- * Calling ecx_alignment_check() directly does not work, because it
- * enables IRQs and the GHCB is active. Forward the exception and call
- * it later from vc_forward_exception().
- */
- ctxt->fi.vector = X86_TRAP_AC;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
-}
-
-static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
- struct ghcb *ghcb,
- unsigned long exit_code)
-{
- enum es_result result = vc_check_opcode_bytes(ctxt, exit_code);
-
- if (result != ES_OK)
- return result;
-
- switch (exit_code) {
- case SVM_EXIT_READ_DR7:
- result = vc_handle_dr7_read(ghcb, ctxt);
- break;
- case SVM_EXIT_WRITE_DR7:
- result = vc_handle_dr7_write(ghcb, ctxt);
- break;
- case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
- result = vc_handle_trap_ac(ghcb, ctxt);
- break;
- case SVM_EXIT_RDTSC:
- case SVM_EXIT_RDTSCP:
- result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
- break;
- case SVM_EXIT_RDPMC:
- result = vc_handle_rdpmc(ghcb, ctxt);
- break;
- case SVM_EXIT_INVD:
- pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
- result = ES_UNSUPPORTED;
- break;
- case SVM_EXIT_CPUID:
- result = vc_handle_cpuid(ghcb, ctxt);
- break;
- case SVM_EXIT_IOIO:
- result = vc_handle_ioio(ghcb, ctxt);
- break;
- case SVM_EXIT_MSR:
- result = vc_handle_msr(ghcb, ctxt);
- break;
- case SVM_EXIT_VMMCALL:
- result = vc_handle_vmmcall(ghcb, ctxt);
- break;
- case SVM_EXIT_WBINVD:
- result = vc_handle_wbinvd(ghcb, ctxt);
- break;
- case SVM_EXIT_MONITOR:
- result = vc_handle_monitor(ghcb, ctxt);
- break;
- case SVM_EXIT_MWAIT:
- result = vc_handle_mwait(ghcb, ctxt);
- break;
- case SVM_EXIT_NPF:
- result = vc_handle_mmio(ghcb, ctxt);
- break;
- default:
- /*
- * Unexpected #VC exception
- */
- result = ES_UNSUPPORTED;
- }
-
- return result;
-}
-
-static __always_inline bool is_vc2_stack(unsigned long sp)
-{
- return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
-}
-
-static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
-{
- unsigned long sp, prev_sp;
-
- sp = (unsigned long)regs;
- prev_sp = regs->sp;
-
- /*
- * If the code was already executing on the VC2 stack when the #VC
- * happened, let it proceed to the normal handling routine. This way the
- * code executing on the VC2 stack can cause #VC exceptions to get handled.
- */
- return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
-}
-
-static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
-{
- struct ghcb_state state;
- struct es_em_ctxt ctxt;
- enum es_result result;
- struct ghcb *ghcb;
- bool ret = true;
-
- ghcb = __sev_get_ghcb(&state);
-
- vc_ghcb_invalidate(ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, error_code);
-
- if (result == ES_OK)
- result = vc_handle_exitcode(&ctxt, ghcb, error_code);
-
- __sev_put_ghcb(&state);
-
- /* Done - now check the result */
- switch (result) {
- case ES_OK:
- vc_finish_insn(&ctxt);
- break;
- case ES_UNSUPPORTED:
- pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
- error_code, regs->ip);
- ret = false;
- break;
- case ES_VMM_ERROR:
- pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
- error_code, regs->ip);
- ret = false;
- break;
- case ES_DECODE_FAILED:
- pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
- error_code, regs->ip);
- ret = false;
- break;
- case ES_EXCEPTION:
- vc_forward_exception(&ctxt);
- break;
- case ES_RETRY:
- /* Nothing to do */
- break;
- default:
- pr_emerg("Unknown result in %s():%d\n", __func__, result);
- /*
- * Emulating the instruction which caused the #VC exception
- * failed - can't continue so print debug information
- */
- BUG();
- }
-
- return ret;
-}
-
-static __always_inline bool vc_is_db(unsigned long error_code)
-{
- return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
-}
-
-/*
- * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
- * and will panic when an error happens.
- */
-DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
-{
- irqentry_state_t irq_state;
-
- /*
- * With the current implementation it is always possible to switch to a
- * safe stack because #VC exceptions only happen at known places, like
- * intercepted instructions or accesses to MMIO areas/IO ports. They can
- * also happen with code instrumentation when the hypervisor intercepts
- * #DB, but the critical paths are forbidden to be instrumented, so #DB
- * exceptions currently also only happen in safe places.
- *
- * But keep this here in case the noinstr annotations are violated due
- * to bug elsewhere.
- */
- if (unlikely(vc_from_invalid_context(regs))) {
- instrumentation_begin();
- panic("Can't handle #VC exception from unsupported context\n");
- instrumentation_end();
- }
-
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (vc_is_db(error_code)) {
- exc_debug(regs);
- return;
- }
-
- irq_state = irqentry_nmi_enter(regs);
-
- instrumentation_begin();
-
- if (!vc_raw_handle_exception(regs, error_code)) {
- /* Show some debug info */
- show_regs(regs);
-
- /* Ask hypervisor to sev_es_terminate */
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-
- /* If that fails and we get here - just panic */
- panic("Returned from Terminate-Request to Hypervisor\n");
- }
-
- instrumentation_end();
- irqentry_nmi_exit(regs, irq_state);
-}
-
-/*
- * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
- * and will kill the current task with SIGBUS when an error happens.
- */
-DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
-{
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (vc_is_db(error_code)) {
- noist_exc_debug(regs);
- return;
- }
-
- irqentry_enter_from_user_mode(regs);
- instrumentation_begin();
-
- if (!vc_raw_handle_exception(regs, error_code)) {
- /*
- * Do not kill the machine if user-space triggered the
- * exception. Send SIGBUS instead and let user-space deal with
- * it.
- */
- force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
- }
-
- instrumentation_end();
- irqentry_exit_to_user_mode(regs);
-}
-
-bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
-{
- unsigned long exit_code = regs->orig_ax;
- struct es_em_ctxt ctxt;
- enum es_result result;
-
- vc_ghcb_invalidate(boot_ghcb);
-
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
- if (result == ES_OK)
- result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
-
- /* Done - now check the result */
- switch (result) {
- case ES_OK:
- vc_finish_insn(&ctxt);
- break;
- case ES_UNSUPPORTED:
- early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_VMM_ERROR:
- early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_DECODE_FAILED:
- early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_EXCEPTION:
- vc_early_forward_exception(&ctxt);
- break;
- case ES_RETRY:
- /* Nothing to do */
- break;
- default:
- BUG();
- }
-
- return true;
-
-fail:
- show_regs(regs);
-
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-}
-
-/*
- * Initial set up of SNP relies on information provided by the
- * Confidential Computing blob, which can be passed to the kernel
- * in the following ways, depending on how it is booted:
- *
- * - when booted via the boot/decompress kernel:
- * - via boot_params
- *
- * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
- * - via a setup_data entry, as defined by the Linux Boot Protocol
- *
- * Scan for the blob in that order.
- */
-static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- /* Boot kernel would have passed the CC blob via boot_params. */
- if (bp->cc_blob_address) {
- cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
- goto found_cc_info;
- }
-
- /*
- * If kernel was booted directly, without the use of the
- * boot/decompression kernel, the CC blob may have been passed via
- * setup_data instead.
- */
- cc_info = find_cc_blob_setup_data(bp);
- if (!cc_info)
- return NULL;
-
-found_cc_info:
- if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
- snp_abort();
-
- return cc_info;
-}
-
-static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
-{
- struct svsm_call call = {};
- int ret;
- u64 pa;
-
- /*
- * Record the SVSM Calling Area address (CAA) if the guest is not
- * running at VMPL0. The CA will be used to communicate with the
- * SVSM to perform the SVSM services.
- */
- if (!svsm_setup_ca(cc_info))
- return;
-
- /*
- * It is very early in the boot and the kernel is running identity
- * mapped but without having adjusted the pagetables to where the
- * kernel was loaded (physbase), so the get the CA address using
- * RIP-relative addressing.
- */
- pa = (u64)&RIP_REL_REF(boot_svsm_ca_page);
-
- /*
- * Switch over to the boot SVSM CA while the current CA is still
- * addressable. There is no GHCB at this point so use the MSR protocol.
- *
- * SVSM_CORE_REMAP_CA call:
- * RAX = 0 (Protocol=0, CallID=0)
- * RCX = New CA GPA
- */
- call.caa = svsm_get_caa();
- call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
- call.rcx = pa;
- ret = svsm_perform_call_protocol(&call);
- if (ret)
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
-
- RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa;
- RIP_REL_REF(boot_svsm_caa_pa) = pa;
-}
-
-bool __head snp_init(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- if (!bp)
- return false;
-
- cc_info = find_cc_blob(bp);
- if (!cc_info)
- return false;
-
- if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE)
- secrets_pa = cc_info->secrets_phys;
- else
- return false;
-
- setup_cpuid_table(cc_info);
-
- svsm_setup(cc_info);
-
- /*
- * The CC blob will be used later to access the secrets page. Cache
- * it here like the boot kernel does.
- */
- bp->cc_blob_address = (u32)(unsigned long)cc_info;
-
- return true;
-}
-
-void __head __noreturn snp_abort(void)
-{
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
-}
-
/*
* SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are
* enabled, as the alternative (fallback) logic for DMI probing in the legacy
@@ -2835,7 +1596,7 @@ struct snp_msg_desc *snp_msg_alloc(void)
if (!mdesc)
return ERR_PTR(-ENOMEM);
- mem = ioremap_encrypted(secrets_pa, PAGE_SIZE);
+ mem = ioremap_encrypted(sev_secrets_pa, PAGE_SIZE);
if (!mem)
goto e_free_mdesc;
diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/sev-nmi.c
new file mode 100644
index 000000000000..d8dfaddfb367
--- /dev/null
+++ b/arch/x86/coco/sev/sev-nmi.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/bug.h>
+#include <linux/kernel.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/msr.h>
+#include <asm/ptrace.h>
+#include <asm/sev.h>
+#include <asm/sev-internal.h>
+
+static __always_inline bool on_vc_stack(struct pt_regs *regs)
+{
+ unsigned long sp = regs->sp;
+
+ /* User-mode RSP is not trusted */
+ if (user_mode(regs))
+ return false;
+
+ /* SYSCALL gap still has user-mode RSP */
+ if (ip_within_syscall_gap(regs))
+ return false;
+
+ return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
+}
+
+/*
+ * This function handles the case when an NMI is raised in the #VC
+ * exception handler entry code, before the #VC handler has switched off
+ * its IST stack. In this case, the IST entry for #VC must be adjusted,
+ * so that any nested #VC exception will not overwrite the stack
+ * contents of the interrupted #VC handler.
+ *
+ * The IST entry is adjusted unconditionally so that it can be also be
+ * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
+ * nested sev_es_ist_exit() call may adjust back the IST entry too
+ * early.
+ *
+ * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
+ * on the NMI IST stack, as they are only called from NMI handling code
+ * right now.
+ */
+void noinstr __sev_es_ist_enter(struct pt_regs *regs)
+{
+ unsigned long old_ist, new_ist;
+
+ /* Read old IST entry */
+ new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+ /*
+ * If NMI happened while on the #VC IST stack, set the new IST
+ * value below regs->sp, so that the interrupted stack frame is
+ * not overwritten by subsequent #VC exceptions.
+ */
+ if (on_vc_stack(regs))
+ new_ist = regs->sp;
+
+ /*
+ * Reserve additional 8 bytes and store old IST value so this
+ * adjustment can be unrolled in __sev_es_ist_exit().
+ */
+ new_ist -= sizeof(old_ist);
+ *(unsigned long *)new_ist = old_ist;
+
+ /* Set new IST entry */
+ this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
+}
+
+void noinstr __sev_es_ist_exit(void)
+{
+ unsigned long ist;
+
+ /* Read IST entry */
+ ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+ if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
+ return;
+
+ /* Read back old IST entry and write it to the TSS */
+ this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
+}
+
+void noinstr __sev_es_nmi_complete(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
+ VMGEXIT();
+
+ __sev_put_ghcb(&state);
+}
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
new file mode 100644
index 000000000000..b4895c648024
--- /dev/null
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -0,0 +1,1061 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/sched/debug.h> /* For show_regs() */
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <uapi/linux/sev-guest.h>
+
+#include <asm/init.h>
+#include <asm/stacktrace.h>
+#include <asm/sev.h>
+#include <asm/sev-internal.h>
+#include <asm/insn-eval.h>
+#include <asm/fpu/xcr.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/traps.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid.h>
+
+static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned long vaddr, phys_addr_t *paddr)
+{
+ unsigned long va = (unsigned long)vaddr;
+ unsigned int level;
+ phys_addr_t pa;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3_pa());
+ pgd = &pgd[pgd_index(va)];
+ pte = lookup_address_in_pgd(pgd, va, &level);
+ if (!pte) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.cr2 = vaddr;
+ ctxt->fi.error_code = 0;
+
+ if (user_mode(ctxt->regs))
+ ctxt->fi.error_code |= X86_PF_USER;
+
+ return ES_EXCEPTION;
+ }
+
+ if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
+ /* Emulated MMIO to/from encrypted memory not supported */
+ return ES_UNSUPPORTED;
+
+ pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
+ pa |= va & ~page_level_mask(level);
+
+ *paddr = pa;
+
+ return ES_OK;
+}
+
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+ BUG_ON(size > 4);
+
+ if (user_mode(ctxt->regs)) {
+ struct thread_struct *t = &current->thread;
+ struct io_bitmap *iobm = t->io_bitmap;
+ size_t idx;
+
+ if (!iobm)
+ goto fault;
+
+ for (idx = port; idx < port + size; ++idx) {
+ if (test_bit(idx, iobm->bitmap))
+ goto fault;
+ }
+ }
+
+ return ES_OK;
+
+fault:
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+
+ return ES_EXCEPTION;
+}
+
+void vc_forward_exception(struct es_em_ctxt *ctxt)
+{
+ long error_code = ctxt->fi.error_code;
+ int trapnr = ctxt->fi.vector;
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+
+ switch (trapnr) {
+ case X86_TRAP_GP:
+ exc_general_protection(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_UD:
+ exc_invalid_op(ctxt->regs);
+ break;
+ case X86_TRAP_PF:
+ write_cr2(ctxt->fi.cr2);
+ exc_page_fault(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_AC:
+ exc_alignment_check(ctxt->regs, error_code);
+ break;
+ default:
+ pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
+ BUG();
+ }
+}
+
+static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
+ unsigned char *buffer)
+{
+ return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+}
+
+static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int insn_bytes;
+
+ insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
+ if (insn_bytes == 0) {
+ /* Nothing could be copied */
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ } else if (insn_bytes == -EINVAL) {
+ /* Effective RIP could not be calculated */
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ ctxt->fi.cr2 = 0;
+ return ES_EXCEPTION;
+ }
+
+ if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
+ return ES_DECODE_FAILED;
+
+ if (ctxt->insn.immediate.got)
+ return ES_OK;
+ else
+ return ES_DECODE_FAILED;
+}
+
+static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int res, ret;
+
+ res = vc_fetch_insn_kernel(ctxt, buffer);
+ if (res) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ }
+
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+ else
+ return ES_OK;
+}
+
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ if (user_mode(ctxt->regs))
+ return __vc_decode_user_insn(ctxt);
+ else
+ return __vc_decode_kern_insn(ctxt);
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ char *dst, char *buf, size_t size)
+{
+ unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
+
+ /*
+ * This function uses __put_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __put_user() does no
+ * sanity checks of the pointer being accessed. All that it does is
+ * to report when the access failed.
+ *
+ * Also, this function runs in atomic context, so __put_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_to_user() here because
+ * vc_write_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whatever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
+ switch (size) {
+ case 1: {
+ u8 d1;
+ u8 __user *target = (u8 __user *)dst;
+
+ memcpy(&d1, buf, 1);
+ if (__put_user(d1, target))
+ goto fault;
+ break;
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *target = (u16 __user *)dst;
+
+ memcpy(&d2, buf, 2);
+ if (__put_user(d2, target))
+ goto fault;
+ break;
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *target = (u32 __user *)dst;
+
+ memcpy(&d4, buf, 4);
+ if (__put_user(d4, target))
+ goto fault;
+ break;
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *target = (u64 __user *)dst;
+
+ memcpy(&d8, buf, 8);
+ if (__put_user(d8, target))
+ goto fault;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+ return ES_UNSUPPORTED;
+ }
+
+ return ES_OK;
+
+fault:
+ if (user_mode(ctxt->regs))
+ error_code |= X86_PF_USER;
+
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = error_code;
+ ctxt->fi.cr2 = (unsigned long)dst;
+
+ return ES_EXCEPTION;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ char *src, char *buf, size_t size)
+{
+ unsigned long error_code = X86_PF_PROT;
+
+ /*
+ * This function uses __get_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __get_user() does no
+ * sanity checks of the pointer being accessed. All that it does is
+ * to report when the access failed.
+ *
+ * Also, this function runs in atomic context, so __get_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_from_user() here because
+ * vc_read_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whatever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
+ switch (size) {
+ case 1: {
+ u8 d1;
+ u8 __user *s = (u8 __user *)src;
+
+ if (__get_user(d1, s))
+ goto fault;
+ memcpy(buf, &d1, 1);
+ break;
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *s = (u16 __user *)src;
+
+ if (__get_user(d2, s))
+ goto fault;
+ memcpy(buf, &d2, 2);
+ break;
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *s = (u32 __user *)src;
+
+ if (__get_user(d4, s))
+ goto fault;
+ memcpy(buf, &d4, 4);
+ break;
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *s = (u64 __user *)src;
+ if (__get_user(d8, s))
+ goto fault;
+ memcpy(buf, &d8, 8);
+ break;
+ }
+ default:
+ WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+ return ES_UNSUPPORTED;
+ }
+
+ return ES_OK;
+
+fault:
+ if (user_mode(ctxt->regs))
+ error_code |= X86_PF_USER;
+
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = error_code;
+ ctxt->fi.cr2 = (unsigned long)src;
+
+ return ES_EXCEPTION;
+}
+
+#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__)
+
+#include "vc-shared.c"
+
+/* Writes to the SVSM CAA MSR are ignored */
+static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write)
+{
+ if (write)
+ return ES_OK;
+
+ regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa));
+ regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa));
+
+ return ES_OK;
+}
+
+/*
+ * TSC related accesses should not exit to the hypervisor when a guest is
+ * executing with Secure TSC enabled, so special handling is required for
+ * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ.
+ */
+static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write)
+{
+ u64 tsc;
+
+ /*
+ * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled.
+ * Terminate the SNP guest when the interception is enabled.
+ */
+ if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
+ return ES_VMM_ERROR;
+
+ /*
+ * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC
+ * to return undefined values, so ignore all writes.
+ *
+ * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use
+ * the value returned by rdtsc_ordered().
+ */
+ if (write) {
+ WARN_ONCE(1, "TSC MSR writes are verboten!\n");
+ return ES_OK;
+ }
+
+ tsc = rdtsc_ordered();
+ regs->ax = lower_32_bits(tsc);
+ regs->dx = upper_32_bits(tsc);
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ enum es_result ret;
+ bool write;
+
+ /* Is it a WRMSR? */
+ write = ctxt->insn.opcode.bytes[1] == 0x30;
+
+ switch (regs->cx) {
+ case MSR_SVSM_CAA:
+ return __vc_handle_msr_caa(regs, write);
+ case MSR_IA32_TSC:
+ case MSR_AMD64_GUEST_TSC_FREQ:
+ if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
+ return __vc_handle_secure_tsc_msrs(regs, write);
+ break;
+ default:
+ break;
+ }
+
+ ghcb_set_rcx(ghcb, regs->cx);
+ if (write) {
+ ghcb_set_rax(ghcb, regs->ax);
+ ghcb_set_rdx(ghcb, regs->dx);
+ }
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0);
+
+ if ((ret == ES_OK) && !write) {
+ regs->ax = ghcb->save.rax;
+ regs->dx = ghcb->save.rdx;
+ }
+
+ return ret;
+}
+
+static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
+{
+ int trapnr = ctxt->fi.vector;
+
+ if (trapnr == X86_TRAP_PF)
+ native_write_cr2(ctxt->fi.cr2);
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+ do_early_exception(ctxt->regs, trapnr);
+}
+
+static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
+{
+ long *reg_array;
+ int offset;
+
+ reg_array = (long *)ctxt->regs;
+ offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
+
+ if (offset < 0)
+ return NULL;
+
+ offset /= sizeof(long);
+
+ return reg_array + offset;
+}
+static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned int bytes, bool read)
+{
+ u64 exit_code, exit_info_1, exit_info_2;
+ unsigned long ghcb_pa = __pa(ghcb);
+ enum es_result res;
+ phys_addr_t paddr;
+ void __user *ref;
+
+ ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
+ if (ref == (void __user *)-1L)
+ return ES_UNSUPPORTED;
+
+ exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
+
+ res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
+ if (res != ES_OK) {
+ if (res == ES_EXCEPTION && !read)
+ ctxt->fi.error_code |= X86_PF_WRITE;
+
+ return res;
+ }
+
+ exit_info_1 = paddr;
+ /* Can never be greater than 8 */
+ exit_info_2 = bytes;
+
+ ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
+
+ return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
+}
+
+/*
+ * The MOVS instruction has two memory operands, which raises the
+ * problem that it is not known whether the access to the source or the
+ * destination caused the #VC exception (and hence whether an MMIO read
+ * or write operation needs to be emulated).
+ *
+ * Instead of playing games with walking page-tables and trying to guess
+ * whether the source or destination is an MMIO range, split the move
+ * into two operations, a read and a write with only one memory operand.
+ * This will cause a nested #VC exception on the MMIO address which can
+ * then be handled.
+ *
+ * This implementation has the benefit that it also supports MOVS where
+ * source _and_ destination are MMIO regions.
+ *
+ * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
+ * rare operation. If it turns out to be a performance problem the split
+ * operations can be moved to memcpy_fromio() and memcpy_toio().
+ */
+static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
+ unsigned int bytes)
+{
+ unsigned long ds_base, es_base;
+ unsigned char *src, *dst;
+ unsigned char buffer[8];
+ enum es_result ret;
+ bool rep;
+ int off;
+
+ ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
+ es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
+
+ if (ds_base == -1L || es_base == -1L) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ src = ds_base + (unsigned char *)ctxt->regs->si;
+ dst = es_base + (unsigned char *)ctxt->regs->di;
+
+ ret = vc_read_mem(ctxt, src, buffer, bytes);
+ if (ret != ES_OK)
+ return ret;
+
+ ret = vc_write_mem(ctxt, dst, buffer, bytes);
+ if (ret != ES_OK)
+ return ret;
+
+ if (ctxt->regs->flags & X86_EFLAGS_DF)
+ off = -bytes;
+ else
+ off = bytes;
+
+ ctxt->regs->si += off;
+ ctxt->regs->di += off;
+
+ rep = insn_has_rep_prefix(&ctxt->insn);
+ if (rep)
+ ctxt->regs->cx -= 1;
+
+ if (!rep || ctxt->regs->cx == 0)
+ return ES_OK;
+ else
+ return ES_RETRY;
+}
+
+static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct insn *insn = &ctxt->insn;
+ enum insn_mmio_type mmio;
+ unsigned int bytes = 0;
+ enum es_result ret;
+ u8 sign_byte;
+ long *reg_data;
+
+ mmio = insn_decode_mmio(insn, &bytes);
+ if (mmio == INSN_MMIO_DECODE_FAILED)
+ return ES_DECODE_FAILED;
+
+ if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
+ reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
+ if (!reg_data)
+ return ES_DECODE_FAILED;
+ }
+
+ if (user_mode(ctxt->regs))
+ return ES_UNSUPPORTED;
+
+ switch (mmio) {
+ case INSN_MMIO_WRITE:
+ memcpy(ghcb->shared_buffer, reg_data, bytes);
+ ret = vc_do_mmio(ghcb, ctxt, bytes, false);
+ break;
+ case INSN_MMIO_WRITE_IMM:
+ memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
+ ret = vc_do_mmio(ghcb, ctxt, bytes, false);
+ break;
+ case INSN_MMIO_READ:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ /* Zero-extend for 32-bit operation */
+ if (bytes == 4)
+ *reg_data = 0;
+
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case INSN_MMIO_READ_ZERO_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ /* Zero extend based on operand size */
+ memset(reg_data, 0, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case INSN_MMIO_READ_SIGN_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ if (bytes == 1) {
+ u8 *val = (u8 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x80) ? 0xff : 0x00;
+ } else {
+ u16 *val = (u16 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x8000) ? 0xff : 0x00;
+ }
+
+ /* Sign extend based on operand size */
+ memset(reg_data, sign_byte, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case INSN_MMIO_MOVS:
+ ret = vc_handle_mmio_movs(ctxt, bytes);
+ break;
+ default:
+ ret = ES_UNSUPPORTED;
+ break;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ long val, *reg = vc_insn_get_rm(ctxt);
+ enum es_result ret;
+
+ if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
+ return ES_VMM_ERROR;
+
+ if (!reg)
+ return ES_DECODE_FAILED;
+
+ val = *reg;
+
+ /* Upper 32 bits must be written as zeroes */
+ if (val >> 32) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ /* Clear out other reserved bits and set bit 10 */
+ val = (val & 0xffff23ffL) | BIT(10);
+
+ /* Early non-zero writes to DR7 are not supported */
+ if (!data && (val & ~DR7_RESET_VALUE))
+ return ES_UNSUPPORTED;
+
+ /* Using a value of 0 for ExitInfo1 means RAX holds the value */
+ ghcb_set_rax(ghcb, val);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (data)
+ data->dr7 = val;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ long *reg = vc_insn_get_rm(ctxt);
+
+ if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
+ return ES_VMM_ERROR;
+
+ if (!reg)
+ return ES_DECODE_FAILED;
+
+ if (data)
+ *reg = data->dr7;
+ else
+ *reg = DR7_RESET_VALUE;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
+}
+
+static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ enum es_result ret;
+
+ ghcb_set_rcx(ghcb, ctxt->regs->cx);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+ ctxt->regs->dx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_monitor(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /*
+ * Treat it as a NOP and do not leak a physical address to the
+ * hypervisor.
+ */
+ return ES_OK;
+}
+
+static enum es_result vc_handle_mwait(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /* Treat the same as MONITOR/MONITORX */
+ return ES_OK;
+}
+
+static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ enum es_result ret;
+
+ ghcb_set_rax(ghcb, ctxt->regs->ax);
+ ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
+
+ if (x86_platform.hyper.sev_es_hcall_prepare)
+ x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!ghcb_rax_is_valid(ghcb))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+
+ /*
+ * Call sev_es_hcall_finish() after regs->ax is already set.
+ * This allows the hypervisor handler to overwrite it again if
+ * necessary.
+ */
+ if (x86_platform.hyper.sev_es_hcall_finish &&
+ !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
+ return ES_VMM_ERROR;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /*
+ * Calling ecx_alignment_check() directly does not work, because it
+ * enables IRQs and the GHCB is active. Forward the exception and call
+ * it later from vc_forward_exception().
+ */
+ ctxt->fi.vector = X86_TRAP_AC;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+}
+
+static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
+ struct ghcb *ghcb,
+ unsigned long exit_code)
+{
+ enum es_result result = vc_check_opcode_bytes(ctxt, exit_code);
+
+ if (result != ES_OK)
+ return result;
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_DR7:
+ result = vc_handle_dr7_read(ghcb, ctxt);
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ result = vc_handle_dr7_write(ghcb, ctxt);
+ break;
+ case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
+ result = vc_handle_trap_ac(ghcb, ctxt);
+ break;
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
+ break;
+ case SVM_EXIT_RDPMC:
+ result = vc_handle_rdpmc(ghcb, ctxt);
+ break;
+ case SVM_EXIT_INVD:
+ pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
+ result = ES_UNSUPPORTED;
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(ghcb, ctxt);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MSR:
+ result = vc_handle_msr(ghcb, ctxt);
+ break;
+ case SVM_EXIT_VMMCALL:
+ result = vc_handle_vmmcall(ghcb, ctxt);
+ break;
+ case SVM_EXIT_WBINVD:
+ result = vc_handle_wbinvd(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MONITOR:
+ result = vc_handle_monitor(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MWAIT:
+ result = vc_handle_mwait(ghcb, ctxt);
+ break;
+ case SVM_EXIT_NPF:
+ result = vc_handle_mmio(ghcb, ctxt);
+ break;
+ default:
+ /*
+ * Unexpected #VC exception
+ */
+ result = ES_UNSUPPORTED;
+ }
+
+ return result;
+}
+
+static __always_inline bool is_vc2_stack(unsigned long sp)
+{
+ return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
+}
+
+static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
+{
+ unsigned long sp, prev_sp;
+
+ sp = (unsigned long)regs;
+ prev_sp = regs->sp;
+
+ /*
+ * If the code was already executing on the VC2 stack when the #VC
+ * happened, let it proceed to the normal handling routine. This way the
+ * code executing on the VC2 stack can cause #VC exceptions to get handled.
+ */
+ return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
+}
+
+static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+ struct ghcb *ghcb;
+ bool ret = true;
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, error_code);
+
+ if (result == ES_OK)
+ result = vc_handle_exitcode(&ctxt, ghcb, error_code);
+
+ __sev_put_ghcb(&state);
+
+ /* Done - now check the result */
+ switch (result) {
+ case ES_OK:
+ vc_finish_insn(&ctxt);
+ break;
+ case ES_UNSUPPORTED:
+ pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_VMM_ERROR:
+ pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_DECODE_FAILED:
+ pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_EXCEPTION:
+ vc_forward_exception(&ctxt);
+ break;
+ case ES_RETRY:
+ /* Nothing to do */
+ break;
+ default:
+ pr_emerg("Unknown result in %s():%d\n", __func__, result);
+ /*
+ * Emulating the instruction which caused the #VC exception
+ * failed - can't continue so print debug information
+ */
+ BUG();
+ }
+
+ return ret;
+}
+
+static __always_inline bool vc_is_db(unsigned long error_code)
+{
+ return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
+}
+
+/*
+ * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
+ * and will panic when an error happens.
+ */
+DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
+{
+ irqentry_state_t irq_state;
+
+ /*
+ * With the current implementation it is always possible to switch to a
+ * safe stack because #VC exceptions only happen at known places, like
+ * intercepted instructions or accesses to MMIO areas/IO ports. They can
+ * also happen with code instrumentation when the hypervisor intercepts
+ * #DB, but the critical paths are forbidden to be instrumented, so #DB
+ * exceptions currently also only happen in safe places.
+ *
+ * But keep this here in case the noinstr annotations are violated due
+ * to bug elsewhere.
+ */
+ if (unlikely(vc_from_invalid_context(regs))) {
+ instrumentation_begin();
+ panic("Can't handle #VC exception from unsupported context\n");
+ instrumentation_end();
+ }
+
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ exc_debug(regs);
+ return;
+ }
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /* Show some debug info */
+ show_regs(regs);
+
+ /* Ask hypervisor to sev_es_terminate */
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /* If that fails and we get here - just panic */
+ panic("Returned from Terminate-Request to Hypervisor\n");
+ }
+
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+/*
+ * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
+ * and will kill the current task with SIGBUS when an error happens.
+ */
+DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
+{
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ noist_exc_debug(regs);
+ return;
+ }
+
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal with
+ * it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ }
+
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+}
+
+bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
+{
+ unsigned long exit_code = regs->orig_ax;
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ vc_ghcb_invalidate(boot_ghcb);
+
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result == ES_OK)
+ result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
+
+ /* Done - now check the result */
+ switch (result) {
+ case ES_OK:
+ vc_finish_insn(&ctxt);
+ break;
+ case ES_UNSUPPORTED:
+ early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_VMM_ERROR:
+ early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_DECODE_FAILED:
+ early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_EXCEPTION:
+ vc_early_forward_exception(&ctxt);
+ break;
+ case ES_RETRY:
+ /* Nothing to do */
+ break;
+ default:
+ BUG();
+ }
+
+ return true;
+
+fail:
+ show_regs(regs);
+
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c
new file mode 100644
index 000000000000..2c0ab0fdc060
--- /dev/null
+++ b/arch/x86/coco/sev/vc-shared.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0
+
+static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
+ unsigned long exit_code)
+{
+ unsigned int opcode = (unsigned int)ctxt->insn.opcode.value;
+ u8 modrm = ctxt->insn.modrm.value;
+
+ switch (exit_code) {
+
+ case SVM_EXIT_IOIO:
+ case SVM_EXIT_NPF:
+ /* handled separately */
+ return ES_OK;
+
+ case SVM_EXIT_CPUID:
+ if (opcode == 0xa20f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_INVD:
+ if (opcode == 0x080f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MONITOR:
+ /* MONITOR and MONITORX instructions generate the same error code */
+ if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa))
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MWAIT:
+ /* MWAIT and MWAITX instructions generate the same error code */
+ if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb))
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MSR:
+ /* RDMSR */
+ if (opcode == 0x320f ||
+ /* WRMSR */
+ opcode == 0x300f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDPMC:
+ if (opcode == 0x330f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDTSC:
+ if (opcode == 0x310f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDTSCP:
+ if (opcode == 0x010f && modrm == 0xf9)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_READ_DR7:
+ if (opcode == 0x210f &&
+ X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_VMMCALL:
+ if (opcode == 0x010f && modrm == 0xd9)
+ return ES_OK;
+
+ break;
+
+ case SVM_EXIT_WRITE_DR7:
+ if (opcode == 0x230f &&
+ X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_WBINVD:
+ if (opcode == 0x90f)
+ return ES_OK;
+ break;
+
+ default:
+ break;
+ }
+
+ sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n",
+ opcode, exit_code, ctxt->regs->ip);
+
+ return ES_UNSUPPORTED;
+}
+
+static bool vc_decoding_needed(unsigned long exit_code)
+{
+ /* Exceptions don't require to decode the instruction */
+ return !(exit_code >= SVM_EXIT_EXCP_BASE &&
+ exit_code <= SVM_EXIT_LAST_EXCP);
+}
+
+static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
+ struct pt_regs *regs,
+ unsigned long exit_code)
+{
+ enum es_result ret = ES_OK;
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->regs = regs;
+
+ if (vc_decoding_needed(exit_code))
+ ret = vc_decode_insn(ctxt);
+
+ return ret;
+}
+
+static void vc_finish_insn(struct es_em_ctxt *ctxt)
+{
+ ctxt->regs->ip += ctxt->insn.length;
+}
+
+static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt,
+ unsigned long address,
+ bool write)
+{
+ if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_USER;
+ ctxt->fi.cr2 = address;
+ if (write)
+ ctxt->fi.error_code |= X86_PF_WRITE;
+
+ return ES_EXCEPTION;
+ }
+
+ return ES_OK;
+}
+
+static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
+ void *src, char *buf,
+ unsigned int data_size,
+ unsigned int count,
+ bool backwards)
+{
+ int i, b = backwards ? -1 : 1;
+ unsigned long address = (unsigned long)src;
+ enum es_result ret;
+
+ ret = vc_insn_string_check(ctxt, address, false);
+ if (ret != ES_OK)
+ return ret;
+
+ for (i = 0; i < count; i++) {
+ void *s = src + (i * data_size * b);
+ char *d = buf + (i * data_size);
+
+ ret = vc_read_mem(ctxt, s, d, data_size);
+ if (ret != ES_OK)
+ break;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
+ void *dst, char *buf,
+ unsigned int data_size,
+ unsigned int count,
+ bool backwards)
+{
+ int i, s = backwards ? -1 : 1;
+ unsigned long address = (unsigned long)dst;
+ enum es_result ret;
+
+ ret = vc_insn_string_check(ctxt, address, true);
+ if (ret != ES_OK)
+ return ret;
+
+ for (i = 0; i < count; i++) {
+ void *d = dst + (i * data_size * s);
+ char *b = buf + (i * data_size);
+
+ ret = vc_write_mem(ctxt, d, b, data_size);
+ if (ret != ES_OK)
+ break;
+ }
+
+ return ret;
+}
+
+#define IOIO_TYPE_STR BIT(2)
+#define IOIO_TYPE_IN 1
+#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR)
+#define IOIO_TYPE_OUT 0
+#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR)
+
+#define IOIO_REP BIT(3)
+
+#define IOIO_ADDR_64 BIT(9)
+#define IOIO_ADDR_32 BIT(8)
+#define IOIO_ADDR_16 BIT(7)
+
+#define IOIO_DATA_32 BIT(6)
+#define IOIO_DATA_16 BIT(5)
+#define IOIO_DATA_8 BIT(4)
+
+#define IOIO_SEG_ES (0 << 10)
+#define IOIO_SEG_DS (3 << 10)
+
+static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
+{
+ struct insn *insn = &ctxt->insn;
+ size_t size;
+ u64 port;
+
+ *exitinfo = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ /* INS opcodes */
+ case 0x6c:
+ case 0x6d:
+ *exitinfo |= IOIO_TYPE_INS;
+ *exitinfo |= IOIO_SEG_ES;
+ port = ctxt->regs->dx & 0xffff;
+ break;
+
+ /* OUTS opcodes */
+ case 0x6e:
+ case 0x6f:
+ *exitinfo |= IOIO_TYPE_OUTS;
+ *exitinfo |= IOIO_SEG_DS;
+ port = ctxt->regs->dx & 0xffff;
+ break;
+
+ /* IN immediate opcodes */
+ case 0xe4:
+ case 0xe5:
+ *exitinfo |= IOIO_TYPE_IN;
+ port = (u8)insn->immediate.value & 0xffff;
+ break;
+
+ /* OUT immediate opcodes */
+ case 0xe6:
+ case 0xe7:
+ *exitinfo |= IOIO_TYPE_OUT;
+ port = (u8)insn->immediate.value & 0xffff;
+ break;
+
+ /* IN register opcodes */
+ case 0xec:
+ case 0xed:
+ *exitinfo |= IOIO_TYPE_IN;
+ port = ctxt->regs->dx & 0xffff;
+ break;
+
+ /* OUT register opcodes */
+ case 0xee:
+ case 0xef:
+ *exitinfo |= IOIO_TYPE_OUT;
+ port = ctxt->regs->dx & 0xffff;
+ break;
+
+ default:
+ return ES_DECODE_FAILED;
+ }
+
+ *exitinfo |= port << 16;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x6c:
+ case 0x6e:
+ case 0xe4:
+ case 0xe6:
+ case 0xec:
+ case 0xee:
+ /* Single byte opcodes */
+ *exitinfo |= IOIO_DATA_8;
+ size = 1;
+ break;
+ default:
+ /* Length determined by instruction parsing */
+ *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
+ : IOIO_DATA_32;
+ size = (insn->opnd_bytes == 2) ? 2 : 4;
+ }
+
+ switch (insn->addr_bytes) {
+ case 2:
+ *exitinfo |= IOIO_ADDR_16;
+ break;
+ case 4:
+ *exitinfo |= IOIO_ADDR_32;
+ break;
+ case 8:
+ *exitinfo |= IOIO_ADDR_64;
+ break;
+ }
+
+ if (insn_has_rep_prefix(insn))
+ *exitinfo |= IOIO_REP;
+
+ return vc_ioio_check(ctxt, (u16)port, size);
+}
+
+static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u64 exit_info_1, exit_info_2;
+ enum es_result ret;
+
+ ret = vc_ioio_exitinfo(ctxt, &exit_info_1);
+ if (ret != ES_OK)
+ return ret;
+
+ if (exit_info_1 & IOIO_TYPE_STR) {
+
+ /* (REP) INS/OUTS */
+
+ bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF);
+ unsigned int io_bytes, exit_bytes;
+ unsigned int ghcb_count, op_count;
+ unsigned long es_base;
+ u64 sw_scratch;
+
+ /*
+ * For the string variants with rep prefix the amount of in/out
+ * operations per #VC exception is limited so that the kernel
+ * has a chance to take interrupts and re-schedule while the
+ * instruction is emulated.
+ */
+ io_bytes = (exit_info_1 >> 4) & 0x7;
+ ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes;
+
+ op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1;
+ exit_info_2 = min(op_count, ghcb_count);
+ exit_bytes = exit_info_2 * io_bytes;
+
+ es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
+
+ /* Read bytes of OUTS into the shared buffer */
+ if (!(exit_info_1 & IOIO_TYPE_IN)) {
+ ret = vc_insn_string_read(ctxt,
+ (void *)(es_base + regs->si),
+ ghcb->shared_buffer, io_bytes,
+ exit_info_2, df);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Issue an VMGEXIT to the HV to consume the bytes from the
+ * shared buffer or to have it write them into the shared buffer
+ * depending on the instruction: OUTS or INS.
+ */
+ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
+ ghcb_set_sw_scratch(ghcb, sw_scratch);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
+ exit_info_1, exit_info_2);
+ if (ret != ES_OK)
+ return ret;
+
+ /* Read bytes from shared buffer into the guest's destination. */
+ if (exit_info_1 & IOIO_TYPE_IN) {
+ ret = vc_insn_string_write(ctxt,
+ (void *)(es_base + regs->di),
+ ghcb->shared_buffer, io_bytes,
+ exit_info_2, df);
+ if (ret)
+ return ret;
+
+ if (df)
+ regs->di -= exit_bytes;
+ else
+ regs->di += exit_bytes;
+ } else {
+ if (df)
+ regs->si -= exit_bytes;
+ else
+ regs->si += exit_bytes;
+ }
+
+ if (exit_info_1 & IOIO_REP)
+ regs->cx -= exit_info_2;
+
+ ret = regs->cx ? ES_RETRY : ES_OK;
+
+ } else {
+
+ /* IN/OUT into/from rAX */
+
+ int bits = (exit_info_1 & 0x70) >> 1;
+ u64 rax = 0;
+
+ if (!(exit_info_1 & IOIO_TYPE_IN))
+ rax = lower_bits(regs->ax, bits);
+
+ ghcb_set_rax(ghcb, rax);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (exit_info_1 & IOIO_TYPE_IN) {
+ if (!ghcb_rax_is_valid(ghcb))
+ return ES_VMM_ERROR;
+ regs->ax = lower_bits(ghcb->save.rax, bits);
+ }
+ }
+
+ return ret;
+}
+
+static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ struct cpuid_leaf leaf;
+ int ret;
+
+ leaf.fn = regs->ax;
+ leaf.subfn = regs->cx;
+ ret = snp_cpuid(ghcb, ctxt, &leaf);
+ if (!ret) {
+ regs->ax = leaf.eax;
+ regs->bx = leaf.ebx;
+ regs->cx = leaf.ecx;
+ regs->dx = leaf.edx;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u32 cr4 = native_read_cr4();
+ enum es_result ret;
+ int snp_cpuid_ret;
+
+ snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt);
+ if (!snp_cpuid_ret)
+ return ES_OK;
+ if (snp_cpuid_ret != -EOPNOTSUPP)
+ return ES_VMM_ERROR;
+
+ ghcb_set_rax(ghcb, regs->ax);
+ ghcb_set_rcx(ghcb, regs->cx);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #GP - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ regs->ax = ghcb->save.rax;
+ regs->bx = ghcb->save.rbx;
+ regs->cx = ghcb->save.rcx;
+ regs->dx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ unsigned long exit_code)
+{
+ bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
+ enum es_result ret;
+
+ /*
+ * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure
+ * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP
+ * instructions are being intercepted. If this should occur and Secure
+ * TSC is enabled, guest execution should be terminated as the guest
+ * cannot rely on the TSC value provided by the hypervisor.
+ */
+ if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
+ return ES_VMM_ERROR;
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) &&
+ (!rdtscp || ghcb_rcx_is_valid(ghcb))))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+ ctxt->regs->dx = ghcb->save.rdx;
+ if (rdtscp)
+ ctxt->regs->cx = ghcb->save.rcx;
+
+ return ES_OK;
+}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 139ad80d1df3..5fe201efc753 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2803,8 +2803,15 @@ static unsigned long get_segment_base(unsigned int segment)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
+ /*
+ * If we're not in a valid context with a real (not just lazy)
+ * user mm, then don't even try.
+ */
+ if (!nmi_uaccess_okay())
+ return 0;
+
/* IRQs are off, so this synchronizes with smp_store_release */
- ldt = READ_ONCE(current->active_mm->context.ldt);
+ ldt = smp_load_acquire(&current->mm->context.ldt);
if (!ldt || idx >= ldt->nr_entries)
return 0;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 4a37a8bd87fd..e18cdaa1573c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -82,6 +82,12 @@ struct alt_instr {
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+extern s32 __retpoline_sites[], __retpoline_sites_end[];
+extern s32 __return_sites[], __return_sites_end[];
+extern s32 __cfi_sites[], __cfi_sites_end[];
+extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
+
/*
* Debug flag that can be tested to see whether alternative
* instructions were patched in already:
@@ -335,11 +341,6 @@ void nop_func(void);
__ALTERNATIVE(\oldinstr, \newinstr, \ft_flags)
.endm
-#define old_len 141b-140b
-#define new_len1 144f-143f
-#define new_len2 145f-144f
-#define new_len3 146f-145f
-
/*
* Same as ALTERNATIVE macro above but for two alternatives. If CPU
* has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index cbc6157f0b4b..b5982b94bdba 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -16,8 +16,7 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res;
- asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
- "call __sw_hweight32",
+ asm_inline (ALTERNATIVE("call __sw_hweight32",
"popcntl %[val], %[cnt]", X86_FEATURE_POPCNT)
: [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT
: [val] REG_IN (w));
@@ -46,8 +45,7 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)
{
unsigned long res;
- asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
- "call __sw_hweight64",
+ asm_inline (ALTERNATIVE("call __sw_hweight64",
"popcntq %[val], %[cnt]", X86_FEATURE_POPCNT)
: [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT
: [val] REG_IN (w));
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index cc2881576c2c..eef0771512de 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -114,17 +114,12 @@
#endif
#ifndef __ASSEMBLER__
-#ifndef __pic__
static __always_inline __pure void *rip_rel_ptr(void *p)
{
asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
return p;
}
-#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var)))
-#else
-#define RIP_REL_REF(var) (var)
-#endif
#endif
/*
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 100413aff640..eebbc8889e70 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -248,7 +248,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
static __always_inline unsigned long variable__ffs(unsigned long word)
{
- asm("rep; bsf %1,%0"
+ asm("tzcnt %1,%0"
: "=r" (word)
: ASM_INPUT_RM (word));
return word;
@@ -267,10 +267,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word)
static __always_inline unsigned long variable_ffz(unsigned long word)
{
- asm("rep; bsf %1,%0"
- : "=r" (word)
- : "r" (~word));
- return word;
+ return variable__ffs(~word);
}
/**
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3f02ff6d333d..02b23aa78955 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -74,6 +74,11 @@
# define BOOT_STACK_SIZE 0x1000
#endif
+#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
+
+#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
+#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0
+
#ifndef __ASSEMBLER__
extern unsigned int output_len;
extern const unsigned long kernel_text_size;
@@ -83,6 +88,11 @@ unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
void (*error)(char *x));
extern struct boot_params *boot_params_ptr;
+extern unsigned long *trampoline_32bit;
+extern const u16 trampoline_ljmp_imm_offset;
+
+void trampoline_32bit_src(void *trampoline, bool enable_5lvl);
+
#endif
#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index e7225452963f..e1dbf8df1b69 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -22,7 +22,7 @@ static inline u64 cc_get_mask(void)
static inline void cc_set_mask(u64 mask)
{
- RIP_REL_REF(cc_mask) = mask;
+ cc_mask = mask;
}
u64 cc_mkenc(u64 val);
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index fdbbbfec745a..719d95f1ab5e 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -23,7 +23,7 @@ DECLARE_PER_CPU(unsigned long, cpu_dr7);
static __always_inline unsigned long native_get_debugreg(int regno)
{
- unsigned long val = 0; /* Damn you, gcc! */
+ unsigned long val;
switch (regno) {
case 0:
@@ -43,7 +43,7 @@ static __always_inline unsigned long native_get_debugreg(int regno)
break;
case 7:
/*
- * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them
+ * Use "asm volatile" for DR7 reads to forbid re-ordering them
* with other code.
*
* This is needed because a DR7 access can cause a #VC exception
@@ -55,7 +55,7 @@ static __always_inline unsigned long native_get_debugreg(int regno)
* re-ordered to happen before the call to sev_es_ist_enter(),
* causing stack recursion.
*/
- asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER);
+ asm volatile("mov %%db7, %0" : "=r" (val));
break;
default:
BUG();
@@ -83,15 +83,15 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value)
break;
case 7:
/*
- * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them
+ * Use "asm volatile" for DR7 writes to forbid re-ordering them
* with other code.
*
* While is didn't happen with a DR7 write (see the DR7 read
* comment above which explains where it happened), add the
- * __FORCE_ORDER here too to avoid similar problems in the
+ * "asm volatile" here too to avoid similar problems in the
* future.
*/
- asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER);
+ asm volatile("mov %0, %%db7" ::"r" (value));
break;
default:
BUG();
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index 53e4015242b4..97f341777db5 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -82,6 +82,7 @@
#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8))
#define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9))
#define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10))
+#define INAT_INV64 (1 << (INAT_FLAG_OFFS + 11))
/* Attribute making macros for attribute tables */
#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
@@ -242,4 +243,9 @@ static inline int inat_evex_scalable(insn_attr_t attr)
{
return attr & INAT_EVEX_SCALABLE;
}
+
+static inline int inat_is_invalid64(insn_attr_t attr)
+{
+ return attr & INAT_INV64;
+}
#endif
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index e889c3bab5a2..ca309a3227c7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -217,7 +217,7 @@ void memset_io(volatile void __iomem *, int, size_t);
static inline void __iowrite32_copy(void __iomem *to, const void *from,
size_t count)
{
- asm volatile("rep ; movsl"
+ asm volatile("rep movsl"
: "=&c"(count), "=&D"(to), "=&S"(from)
: "0"(count), "1"(to), "2"(from)
: "memory");
@@ -282,7 +282,7 @@ static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \
count--; \
} \
} else { \
- asm volatile("rep; outs" #bwl \
+ asm volatile("rep outs" #bwl \
: "+S"(addr), "+c"(count) \
: "d"(port) : "memory"); \
} \
@@ -298,7 +298,7 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \
count--; \
} \
} else { \
- asm volatile("rep; ins" #bwl \
+ asm volatile("rep ins" #bwl \
: "+D"(addr), "+c"(count) \
: "d"(port) : "memory"); \
} \
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 5432457d2338..f2ad77929d6e 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -8,6 +8,9 @@
# define PA_PGD 2
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
+#else
+/* Size of each exception handler referenced by the IDT */
+# define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */
#endif
# define KEXEC_CONTROL_PAGE_SIZE 4096
@@ -59,6 +62,10 @@ struct kimage;
extern unsigned long kexec_va_control_page;
extern unsigned long kexec_pa_table_page;
extern unsigned long kexec_pa_swap_page;
+extern gate_desc kexec_debug_idt[];
+extern unsigned char kexec_debug_exc_vectors[];
+extern uint16_t kexec_debug_8250_port;
+extern unsigned long kexec_debug_8250_mmio32;
#endif
/*
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index b51d8a4673f5..9d38ae744a2e 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -141,5 +141,15 @@
#define SYM_FUNC_START_WEAK_NOALIGN(name) \
SYM_START(name, SYM_L_WEAK, SYM_A_NONE)
+/*
+ * Expose 'sym' to the startup code in arch/x86/boot/startup/, by emitting an
+ * alias prefixed with __pi_
+ */
+#ifdef __ASSEMBLER__
+#define SYM_PIC_ALIAS(sym) SYM_ALIAS(__pi_ ## sym, sym, SYM_L_GLOBAL)
+#else
+#define SYM_PIC_ALIAS(sym) extern typeof(sym) __PASTE(__pi_, sym) __alias(sym)
+#endif
+
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 1530ee301dfe..ea6494628cb0 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -61,7 +61,7 @@ void __init sev_es_init_vc_handling(void);
static inline u64 sme_get_me_mask(void)
{
- return RIP_REL_REF(sme_me_mask);
+ return sme_me_mask;
}
#define __bss_decrypted __section(".bss..decrypted")
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 695e569159c1..be7cddc414e4 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -17,10 +17,12 @@ struct ucode_cpu_info {
void load_ucode_bsp(void);
void load_ucode_ap(void);
void microcode_bsp_resume(void);
+bool __init microcode_loader_disabled(void);
#else
static inline void load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void microcode_bsp_resume(void) { }
+static inline bool __init microcode_loader_disabled(void) { return false; }
#endif
extern unsigned long initrd_start_early;
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 8b8055a8eb9e..0fe9c569d171 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -16,6 +16,8 @@
#define MM_CONTEXT_LOCK_LAM 2
/* Allow LAM and SVA coexisting */
#define MM_CONTEXT_FORCE_TAGGED_SVA 3
+/* Tracks mm_cpumask */
+#define MM_CONTEXT_NOTRACK 4
/*
* x86 has arch-specific MMU state beyond what lives in mm_struct.
@@ -44,9 +46,7 @@ typedef struct {
struct ldt_struct *ldt;
#endif
-#ifdef CONFIG_X86_64
unsigned long flags;
-#endif
#ifdef CONFIG_ADDRESS_MASKING
/* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 2398058b6e83..73bf3b1b44e8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -190,7 +190,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
#define activate_mm(prev, next) \
do { \
paravirt_enter_mmap(next); \
- switch_mm((prev), (next), NULL); \
+ switch_mm_irqs_off((prev), (next), NULL); \
} while (0);
#ifdef CONFIG_X86_32
@@ -247,6 +247,16 @@ static inline bool is_64bit_mm(struct mm_struct *mm)
}
#endif
+static inline bool is_notrack_mm(struct mm_struct *mm)
+{
+ return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags);
+}
+
+static inline void set_notrack_mm(struct mm_struct *mm)
+{
+ set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags);
+}
+
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
@@ -272,4 +282,7 @@ unsigned long __get_current_cr3_fast(void);
#include <asm-generic/mmu_context.h>
+extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm);
+extern void unuse_temporary_mm(struct mm_struct *prev_mm);
+
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index fd4effd7654a..51a677fe9a8d 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
-DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
extern u16 mds_verw_sel;
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 9f77bf03d747..018a8d906ca3 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -29,9 +29,7 @@
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
/* Physical address where kernel should be loaded. */
-#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
- + (CONFIG_PHYSICAL_ALIGN - 1)) \
- & ~(CONFIG_PHYSICAL_ALIGN - 1))
+#define LOAD_PHYSICAL_ADDR __ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1)
#define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 5fe314a2e73e..b0d03b6c279b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -29,6 +29,8 @@
#ifdef CONFIG_SMP
+#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":"
+
#ifdef CONFIG_CC_HAS_NAMED_AS
#ifdef __CHECKER__
@@ -36,23 +38,23 @@
# define __seg_fs __attribute__((address_space(__seg_fs)))
#endif
+#define __percpu_prefix
#define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg)
-#define __percpu_prefix ""
#else /* !CONFIG_CC_HAS_NAMED_AS: */
+#define __percpu_prefix __force_percpu_prefix
#define __percpu_seg_override
-#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
#endif /* CONFIG_CC_HAS_NAMED_AS */
-#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":"
-#define __my_cpu_offset this_cpu_read(this_cpu_off)
-
/*
* Compared to the generic __my_cpu_offset version, the following
* saves one instruction and avoids clobbering a temp register.
- *
+ */
+#define __my_cpu_offset this_cpu_read(this_cpu_off)
+
+/*
* arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit
* kernel, because games are played with CONFIG_X86_64 there and
* sizeof(this_cpu_off) becames 4.
@@ -77,9 +79,9 @@
#else /* !CONFIG_SMP: */
+#define __force_percpu_prefix
+#define __percpu_prefix
#define __percpu_seg_override
-#define __percpu_prefix ""
-#define __force_percpu_prefix ""
#define PER_CPU_VAR(var) (var)__percpu_rel
@@ -97,8 +99,8 @@
# define __my_cpu_var(var) (*__my_cpu_ptr(&(var)))
#endif
-#define __percpu_arg(x) __percpu_prefix "%" #x
#define __force_percpu_arg(x) __force_percpu_prefix "%" #x
+#define __percpu_arg(x) __percpu_prefix "%" #x
/*
* For arch-specific code, we can use direct single-insn ops (they
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5d2f7e5aff26..0973bed22172 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -734,6 +734,7 @@ void store_cpu_caps(struct cpuinfo_x86 *info);
enum l1tf_mitigations {
L1TF_MITIGATION_OFF,
+ L1TF_MITIGATION_AUTO,
L1TF_MITIGATION_FLUSH_NOWARN,
L1TF_MITIGATION_FLUSH,
L1TF_MITIGATION_FLUSH_NOSMT,
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ad9212df0ec0..6324f4c6c545 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -52,6 +52,7 @@ extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
extern void startup_64_setup_gdt_idt(void);
+extern void startup_64_load_idt(void *vc_handler);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h
new file mode 100644
index 000000000000..b7232081f8f7
--- /dev/null
+++ b/arch/x86/include/asm/sev-internal.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define DR7_RESET_VALUE 0x400
+
+extern struct ghcb boot_ghcb_page;
+extern u64 sev_hv_features;
+extern u64 sev_secrets_pa;
+
+/* #VC handler runtime per-CPU data */
+struct sev_es_runtime_data {
+ struct ghcb ghcb_page;
+
+ /*
+ * Reserve one page per CPU as backup storage for the unencrypted GHCB.
+ * It is needed when an NMI happens while the #VC handler uses the real
+ * GHCB, and the NMI handler itself is causing another #VC exception. In
+ * that case the GHCB content of the first handler needs to be backed up
+ * and restored.
+ */
+ struct ghcb backup_ghcb;
+
+ /*
+ * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
+ * There is no need for it to be atomic, because nothing is written to
+ * the GHCB between the read and the write of ghcb_active. So it is safe
+ * to use it when a nested #VC exception happens before the write.
+ *
+ * This is necessary for example in the #VC->NMI->#VC case when the NMI
+ * happens while the first #VC handler uses the GHCB. When the NMI code
+ * raises a second #VC handler it might overwrite the contents of the
+ * GHCB written by the first handler. To avoid this the content of the
+ * GHCB is saved and restored when the GHCB is detected to be in use
+ * already.
+ */
+ bool ghcb_active;
+ bool backup_ghcb_active;
+
+ /*
+ * Cached DR7 value - write it on DR7 writes and return it on reads.
+ * That value will never make it to the real hardware DR7 as debugging
+ * is currently unsupported in SEV-ES guests.
+ */
+ unsigned long dr7;
+};
+
+struct ghcb_state {
+ struct ghcb *ghcb;
+};
+
+extern struct svsm_ca boot_svsm_ca_page;
+
+struct ghcb *__sev_get_ghcb(struct ghcb_state *state);
+void __sev_put_ghcb(struct ghcb_state *state);
+
+DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
+DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
+
+void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, enum psc_op op);
+
+DECLARE_PER_CPU(struct svsm_ca *, svsm_caa);
+DECLARE_PER_CPU(u64, svsm_caa_pa);
+
+extern struct svsm_ca *boot_svsm_caa;
+extern u64 boot_svsm_caa_pa;
+
+static __always_inline struct svsm_ca *svsm_get_caa(void)
+{
+ if (sev_cfg.use_cas)
+ return this_cpu_read(svsm_caa);
+ else
+ return boot_svsm_caa;
+}
+
+static __always_inline u64 svsm_get_caa_pa(void)
+{
+ if (sev_cfg.use_cas)
+ return this_cpu_read(svsm_caa_pa);
+ else
+ return boot_svsm_caa_pa;
+}
+
+int svsm_perform_call_protocol(struct svsm_call *call);
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
+}
+
+static __always_inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ u32 low, high;
+
+ low = (u32)(val);
+ high = (u32)(val >> 32);
+
+ native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
+}
+
+void snp_register_ghcb_early(unsigned long paddr);
+bool sev_es_negotiate_protocol(void);
+bool sev_es_check_cpu_features(void);
+u64 get_hv_features(void);
+
+const struct snp_cpuid_table *snp_cpuid_get_table(void);
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ba7999f66abe..6158893786d6 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -15,6 +15,7 @@
#include <asm/sev-common.h>
#include <asm/coco.h>
#include <asm/set_memory.h>
+#include <asm/svm.h>
#define GHCB_PROTOCOL_MIN 1ULL
#define GHCB_PROTOCOL_MAX 2ULL
@@ -83,6 +84,36 @@ extern void vc_no_ghcb(void);
extern void vc_boot_ghcb(void);
extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
+/*
+ * Individual entries of the SNP CPUID table, as defined by the SNP
+ * Firmware ABI, Revision 0.9, Section 7.1, Table 14.
+ */
+struct snp_cpuid_fn {
+ u32 eax_in;
+ u32 ecx_in;
+ u64 xcr0_in;
+ u64 xss_in;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u64 __reserved;
+} __packed;
+
+/*
+ * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9,
+ * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit
+ * of 64 entries per CPUID table.
+ */
+#define SNP_CPUID_COUNT_MAX 64
+
+struct snp_cpuid_table {
+ u32 count;
+ u32 __reserved1;
+ u64 __reserved2;
+ struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
+} __packed;
+
/* PVALIDATE return codes */
#define PVALIDATE_FAIL_SIZEMISMATCH 6
@@ -484,6 +515,34 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req
void __init snp_secure_tsc_prepare(void);
void __init snp_secure_tsc_init(void);
+static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
+{
+ ghcb->save.sw_exit_code = 0;
+ __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+void vc_forward_exception(struct es_em_ctxt *ctxt);
+
+/* I/O parameters for CPUID-related helpers */
+struct cpuid_leaf {
+ u32 fn;
+ u32 subfn;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+};
+
+int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf);
+
+void __noreturn sev_es_terminate(unsigned int set, unsigned int reason);
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2);
+
+extern struct ghcb *boot_ghcb;
+
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define snp_vmpl 0
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 6266d6b9e0b8..ecda17efa042 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -10,30 +10,19 @@
#include <linux/irqflags.h>
#include <linux/jump_label.h>
-/*
- * The compiler should not reorder volatile asm statements with respect to each
- * other: they should execute in program order. However GCC 4.9.x and 5.x have
- * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder
- * volatile asm. The write functions are not affected since they have memory
- * clobbers preventing reordering. To prevent reads from being reordered with
- * respect to writes, use a dummy memory operand.
- */
-
-#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL)
-
void native_write_cr0(unsigned long val);
static inline unsigned long native_read_cr0(void)
{
unsigned long val;
- asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER);
+ asm volatile("mov %%cr0,%0" : "=r" (val));
return val;
}
static __always_inline unsigned long native_read_cr2(void)
{
unsigned long val;
- asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER);
+ asm volatile("mov %%cr2,%0" : "=r" (val));
return val;
}
@@ -45,7 +34,7 @@ static __always_inline void native_write_cr2(unsigned long val)
static __always_inline unsigned long __native_read_cr3(void)
{
unsigned long val;
- asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER);
+ asm volatile("mov %%cr3,%0" : "=r" (val));
return val;
}
@@ -66,10 +55,10 @@ static inline unsigned long native_read_cr4(void)
asm volatile("1: mov %%cr4, %0\n"
"2:\n"
_ASM_EXTABLE(1b, 2b)
- : "=r" (val) : "0" (0), __FORCE_ORDER);
+ : "=r" (val) : "0" (0));
#else
/* CR4 always exists on x86_64. */
- asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER);
+ asm volatile("mov %%cr4,%0" : "=r" (val));
#endif
return val;
}
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 32c0d981a82a..e9cce169bb4c 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -33,11 +33,11 @@ extern size_t strlen(const char *s);
static __always_inline void *__memcpy(void *to, const void *from, size_t n)
{
int d0, d1, d2;
- asm volatile("rep ; movsl\n\t"
+ asm volatile("rep movsl\n\t"
"movl %4,%%ecx\n\t"
"andl $3,%%ecx\n\t"
"jz 1f\n\t"
- "rep ; movsb\n\t"
+ "rep movsb\n\t"
"1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
@@ -89,7 +89,7 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
if (n >= 5 * 4) {
/* large block: use rep prefix */
int ecx;
- asm volatile("rep ; movsl"
+ asm volatile("rep movsl"
: "=&c" (ecx), "=&D" (edi), "=&S" (esi)
: "0" (n / 4), "1" (edi), "2" (esi)
: "memory"
@@ -165,8 +165,7 @@ extern void *memchr(const void *cs, int c, size_t count);
static inline void *__memset_generic(void *s, char c, size_t count)
{
int d0, d1;
- asm volatile("rep\n\t"
- "stosb"
+ asm volatile("rep stosb"
: "=&c" (d0), "=&D" (d1)
: "a" (c), "1" (s), "0" (count)
: "memory");
@@ -199,8 +198,7 @@ extern void *memset(void *, int, size_t);
static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
{
int d0, d1;
- asm volatile("rep\n\t"
- "stosw"
+ asm volatile("rep stosw"
: "=&c" (d0), "=&D" (d1)
: "a" (v), "1" (s), "0" (n)
: "memory");
@@ -211,8 +209,7 @@ static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
{
int d0, d1;
- asm volatile("rep\n\t"
- "stosl"
+ asm volatile("rep stosl"
: "=&c" (d0), "=&D" (d1)
: "a" (v), "1" (s), "0" (n)
: "memory");
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index ab9e143ec9fe..5337f1be18f6 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -11,11 +11,11 @@
* JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
* Raise it if needed.
*/
-#define POKE_MAX_OPCODE_SIZE 5
+#define TEXT_POKE_MAX_OPCODE_SIZE 5
extern void text_poke_early(void *addr, const void *opcode, size_t len);
-extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);
+extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);
/*
* Clear and restore the kernel write-protection flag on the local CPU.
@@ -32,17 +32,17 @@ extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u
* an inconsistent instruction while you patch.
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
-extern void text_poke_sync(void);
+extern void smp_text_poke_sync_each_cpu(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
#define text_poke_copy text_poke_copy
extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok);
extern void *text_poke_set(void *addr, int c, size_t len);
-extern int poke_int3_handler(struct pt_regs *regs);
-extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
+extern int smp_text_poke_int3_handler(struct pt_regs *regs);
+extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate);
-extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate);
-extern void text_poke_finish(void);
+extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate);
+extern void smp_text_poke_batch_finish(void);
#define INT3_INSN_SIZE 1
#define INT3_INSN_OPCODE 0xCC
@@ -82,7 +82,7 @@ static __always_inline int text_opcode_size(u8 opcode)
}
union text_poke_insn {
- u8 text[POKE_MAX_OPCODE_SIZE];
+ u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
struct {
u8 opcode;
s32 disp;
@@ -128,8 +128,8 @@ void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
}
extern int after_bootmem;
-extern __ro_after_init struct mm_struct *poking_mm;
-extern __ro_after_init unsigned long poking_addr;
+extern __ro_after_init struct mm_struct *text_poke_mm;
+extern __ro_after_init unsigned long text_poke_mm_addr;
#ifndef CONFIG_UML_X86
static __always_inline
@@ -142,13 +142,14 @@ static __always_inline
void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
/*
- * The int3 handler in entry_64.S adds a gap between the
+ * The INT3 handler in entry_64.S adds a gap between the
* stack where the break point happened, and the saving of
* pt_regs. We can extend the original stack because of
- * this gap. See the idtentry macro's create_gap option.
+ * this gap. See the idtentry macro's X86_TRAP_BP logic.
*
- * Similarly entry_32.S will have a gap on the stack for (any) hardware
- * exception and pt_regs; see FIXUP_FRAME.
+ * Similarly, entry_32.S will have a gap on the stack for
+ * (any) hardware exception and pt_regs; see the
+ * FIXUP_FRAME macro.
*/
regs->sp -= sizeof(unsigned long);
*(unsigned long *)regs->sp = val;
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c52f0133425b..c8a5ae35c871 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -26,8 +26,8 @@ extern unsigned long USER_PTR_MAX;
*/
static inline unsigned long __untagged_addr(unsigned long addr)
{
- asm (ALTERNATIVE("",
- "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM)
+ asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]",
+ X86_FEATURE_LAM)
: [addr] "+r" (addr)
: [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));
@@ -54,7 +54,7 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
#endif
#define valid_user_address(x) \
- ((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX))
+ likely((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX))
/*
* Masking the user address is an alternative to a conditional
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
index c9b2ba7a9ec4..7000aeb59aa2 100644
--- a/arch/x86/include/asm/vdso/processor.h
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -7,15 +7,15 @@
#ifndef __ASSEMBLER__
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static __always_inline void rep_nop(void)
+/* PAUSE is a good thing to insert into busy-wait loops. */
+static __always_inline void native_pause(void)
{
- asm volatile("rep; nop" ::: "memory");
+ asm volatile("pause" ::: "memory");
}
static __always_inline void cpu_relax(void)
{
- rep_nop();
+ native_pause();
}
struct getcpu_cache;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index bf82c6f7d690..ddbc303e41e3 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,36 +1,14 @@
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt
-#include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/mmu_context.h>
#include <linux/perf_event.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/stringify.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
-#include <linux/stop_machine.h>
-#include <linux/slab.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-#include <linux/mmu_context.h>
-#include <linux/bsearch.h>
-#include <linux/sync_core.h>
+
#include <asm/text-patching.h>
-#include <asm/alternative.h>
-#include <asm/sections.h>
-#include <asm/mce.h>
-#include <asm/nmi.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
#include <asm/insn.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/paravirt.h>
-#include <asm/asm-prototypes.h>
-#include <asm/cfi.h>
+#include <asm/nmi.h>
int __read_mostly alternatives_patched;
@@ -171,13 +149,6 @@ static void add_nop(u8 *buf, unsigned int len)
*buf = INT3_INSN_OPCODE;
}
-extern s32 __retpoline_sites[], __retpoline_sites_end[];
-extern s32 __return_sites[], __return_sites_end[];
-extern s32 __cfi_sites[], __cfi_sites_end[];
-extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
-extern s32 __smp_locks[], __smp_locks_end[];
-void text_poke_early(void *addr, const void *opcode, size_t len);
-
/*
* Matches NOP and NOPL, not any of the other possible NOPs.
*/
@@ -369,7 +340,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen,
}
}
-void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
__apply_relocation(buf, instr, instrlen, repl, repl_len);
optimize_nops(instr, buf, instrlen);
@@ -525,7 +496,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
- apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
+ text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@@ -2010,7 +1981,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg)
static noinline void __init alt_reloc_selftest(void)
{
/*
- * Tests apply_relocation().
+ * Tests text_poke_apply_relocation().
*
* This has a relative immediate (CALL) in a place other than the first
* instruction and additionally on x86_64 we get a RIP-relative LEA:
@@ -2140,76 +2111,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
}
}
-typedef struct {
- struct mm_struct *mm;
-} temp_mm_state_t;
-
-/*
- * Using a temporary mm allows to set temporary mappings that are not accessible
- * by other CPUs. Such mappings are needed to perform sensitive memory writes
- * that override the kernel memory protections (e.g., W^X), without exposing the
- * temporary page-table mappings that are required for these write operations to
- * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
- * mapping is torn down.
- *
- * Context: The temporary mm needs to be used exclusively by a single core. To
- * harden security IRQs must be disabled while the temporary mm is
- * loaded, thereby preventing interrupt handler bugs from overriding
- * the kernel memory protection.
- */
-static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
-{
- temp_mm_state_t temp_state;
-
- lockdep_assert_irqs_disabled();
-
- /*
- * Make sure not to be in TLB lazy mode, as otherwise we'll end up
- * with a stale address space WITHOUT being in lazy mode after
- * restoring the previous mm.
- */
- if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
- leave_mm();
-
- temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
- switch_mm_irqs_off(NULL, mm, current);
-
- /*
- * If breakpoints are enabled, disable them while the temporary mm is
- * used. Userspace might set up watchpoints on addresses that are used
- * in the temporary mm, which would lead to wrong signals being sent or
- * crashes.
- *
- * Note that breakpoints are not disabled selectively, which also causes
- * kernel breakpoints (e.g., perf's) to be disabled. This might be
- * undesirable, but still seems reasonable as the code that runs in the
- * temporary mm should be short.
- */
- if (hw_breakpoint_active())
- hw_breakpoint_disable();
-
- return temp_state;
-}
-
-__ro_after_init struct mm_struct *poking_mm;
-__ro_after_init unsigned long poking_addr;
-
-static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
-{
- lockdep_assert_irqs_disabled();
-
- switch_mm_irqs_off(NULL, prev_state.mm, current);
-
- /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
- cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
-
- /*
- * Restore the breakpoints if they were disabled before the temporary mm
- * was loaded.
- */
- if (hw_breakpoint_active())
- hw_breakpoint_restore();
-}
+__ro_after_init struct mm_struct *text_poke_mm;
+__ro_after_init unsigned long text_poke_mm_addr;
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
@@ -2229,7 +2132,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
- temp_mm_state_t prev;
+ struct mm_struct *prev_mm;
unsigned long flags;
pte_t pte, *ptep;
spinlock_t *ptl;
@@ -2266,7 +2169,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
/*
* The lock is not really needed, but this allows to avoid open-coding.
*/
- ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
/*
* This must not fail; preallocated in poking_init().
@@ -2276,21 +2179,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
local_irq_save(flags);
pte = mk_pte(pages[0], pgprot);
- set_pte_at(poking_mm, poking_addr, ptep, pte);
+ set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
if (cross_page_boundary) {
pte = mk_pte(pages[1], pgprot);
- set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+ set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
}
/*
* Loading the temporary mm behaves as a compiler barrier, which
* guarantees that the PTE will be set at the time memcpy() is done.
*/
- prev = use_temporary_mm(poking_mm);
+ prev_mm = use_temporary_mm(text_poke_mm);
kasan_disable_current();
- func((u8 *)poking_addr + offset_in_page(addr), src, len);
+ func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
kasan_enable_current();
/*
@@ -2299,22 +2202,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
*/
barrier();
- pte_clear(poking_mm, poking_addr, ptep);
+ pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
if (cross_page_boundary)
- pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
+ pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
/*
* Loading the previous page-table hierarchy requires a serializing
* instruction that already allows the core to see the updated version.
* Xen-PV is assumed to serialize execution in a similar manner.
*/
- unuse_temporary_mm(prev);
+ unuse_temporary_mm(prev_mm);
/*
* Flushing the TLB might involve IPIs, which would require enabled
* IRQs, but not if the mm is not used, as it is in this point.
*/
- flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
+ flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
PAGE_SHIFT, false);
@@ -2450,7 +2353,7 @@ static void do_sync_core(void *info)
sync_core();
}
-void text_poke_sync(void)
+void smp_text_poke_sync_each_cpu(void)
{
on_each_cpu(do_sync_core, NULL, 1);
}
@@ -2460,64 +2363,66 @@ void text_poke_sync(void)
* this thing. When len == 6 everything is prefixed with 0x0f and we map
* opcode to Jcc.d8, using len to distinguish.
*/
-struct text_poke_loc {
+struct smp_text_poke_loc {
/* addr := _stext + rel_addr */
s32 rel_addr;
s32 disp;
u8 len;
u8 opcode;
- const u8 text[POKE_MAX_OPCODE_SIZE];
- /* see text_poke_bp_batch() */
+ const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
+ /* see smp_text_poke_batch_finish() */
u8 old;
};
-struct bp_patching_desc {
- struct text_poke_loc *vec;
+#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
+
+static struct smp_text_poke_array {
+ struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
int nr_entries;
- atomic_t refs;
-};
+} text_poke_array;
-static struct bp_patching_desc bp_desc;
+static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
-static __always_inline
-struct bp_patching_desc *try_get_desc(void)
+/*
+ * These four __always_inline annotations imply noinstr, necessary
+ * due to smp_text_poke_int3_handler() being noinstr:
+ */
+
+static __always_inline bool try_get_text_poke_array(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
- if (!raw_atomic_inc_not_zero(&desc->refs))
- return NULL;
+ if (!raw_atomic_inc_not_zero(refs))
+ return false;
- return desc;
+ return true;
}
-static __always_inline void put_desc(void)
+static __always_inline void put_text_poke_array(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
smp_mb__before_atomic();
- raw_atomic_dec(&desc->refs);
+ raw_atomic_dec(refs);
}
-static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
+static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
{
- return _stext + tp->rel_addr;
+ return _stext + tpl->rel_addr;
}
-static __always_inline int patch_cmp(const void *key, const void *elt)
+static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
{
- struct text_poke_loc *tp = (struct text_poke_loc *) elt;
-
- if (key < text_poke_addr(tp))
+ if (tpl_a < text_poke_addr(tpl_b))
return -1;
- if (key > text_poke_addr(tp))
+ if (tpl_a > text_poke_addr(tpl_b))
return 1;
return 0;
}
-noinstr int poke_int3_handler(struct pt_regs *regs)
+noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
{
- struct bp_patching_desc *desc;
- struct text_poke_loc *tp;
+ struct smp_text_poke_loc *tpl;
int ret = 0;
void *ip;
@@ -2526,41 +2431,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
/*
* Having observed our INT3 instruction, we now must observe
- * bp_desc with non-zero refcount:
+ * text_poke_array with non-zero refcount:
*
- * bp_desc.refs = 1 INT3
- * WMB RMB
- * write INT3 if (bp_desc.refs != 0)
+ * text_poke_array_refs = 1 INT3
+ * WMB RMB
+ * write INT3 if (text_poke_array_refs != 0)
*/
smp_rmb();
- desc = try_get_desc();
- if (!desc)
+ if (!try_get_text_poke_array())
return 0;
/*
- * Discount the INT3. See text_poke_bp_batch().
+ * Discount the INT3. See smp_text_poke_batch_finish().
*/
ip = (void *) regs->ip - INT3_INSN_SIZE;
/*
* Skip the binary search if there is a single member in the vector.
*/
- if (unlikely(desc->nr_entries > 1)) {
- tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
- sizeof(struct text_poke_loc),
+ if (unlikely(text_poke_array.nr_entries > 1)) {
+ tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
+ sizeof(struct smp_text_poke_loc),
patch_cmp);
- if (!tp)
+ if (!tpl)
goto out_put;
} else {
- tp = desc->vec;
- if (text_poke_addr(tp) != ip)
+ tpl = text_poke_array.vec;
+ if (text_poke_addr(tpl) != ip)
goto out_put;
}
- ip += tp->len;
+ ip += tpl->len;
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case INT3_INSN_OPCODE:
/*
* Someone poked an explicit INT3, they'll want to handle it,
@@ -2573,16 +2477,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
break;
case CALL_INSN_OPCODE:
- int3_emulate_call(regs, (long)ip + tp->disp);
+ int3_emulate_call(regs, (long)ip + tpl->disp);
break;
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- int3_emulate_jmp(regs, (long)ip + tp->disp);
+ int3_emulate_jmp(regs, (long)ip + tpl->disp);
break;
case 0x70 ... 0x7f: /* Jcc */
- int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
+ int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
break;
default:
@@ -2592,51 +2496,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
ret = 1;
out_put:
- put_desc();
+ put_text_poke_array();
return ret;
}
-#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
-static struct text_poke_loc tp_vec[TP_VEC_MAX];
-static int tp_vec_nr;
-
/**
- * text_poke_bp_batch() -- update instructions on live kernel on SMP
- * @tp: vector of instructions to patch
- * @nr_entries: number of entries in the vector
+ * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
*
- * Modify multi-byte instruction by using int3 breakpoint on SMP.
- * We completely avoid stop_machine() here, and achieve the
- * synchronization using int3 breakpoint.
+ * Input state:
+ * text_poke_array.vec: vector of instructions to patch
+ * text_poke_array.nr_entries: number of entries in the vector
+ *
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
*
* The way it is done:
* - For each entry in the vector:
- * - add a int3 trap to the address that will be patched
- * - sync cores
+ * - add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
* - For each entry in the vector:
* - update all but the first byte of the patched range
- * - sync cores
+ * - SMP sync all CPUs
* - For each entry in the vector:
- * - replace the first byte (int3) by the first byte of
+ * - replace the first byte (INT3) by the first byte of the
* replacing opcode
- * - sync cores
+ * - SMP sync all CPUs
*/
-static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
+void smp_text_poke_batch_finish(void)
{
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
- lockdep_assert_held(&text_mutex);
+ if (!text_poke_array.nr_entries)
+ return;
- bp_desc.vec = tp;
- bp_desc.nr_entries = nr_entries;
+ lockdep_assert_held(&text_mutex);
/*
- * Corresponds to the implicit memory barrier in try_get_desc() to
- * ensure reading a non-zero refcount provides up to date bp_desc data.
+ * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
+ * ensure reading a non-zero refcount provides up to date text_poke_array data.
*/
- atomic_set_release(&bp_desc.refs, 1);
+ for_each_possible_cpu(i)
+ atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
/*
* Function tracing can enable thousands of places that need to be
@@ -2649,33 +2552,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
cond_resched();
/*
- * Corresponding read barrier in int3 notifier for making sure the
- * nr_entries and handler are correctly ordered wrt. patching.
+ * Corresponding read barrier in INT3 notifier for making sure the
+ * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
*/
smp_wmb();
/*
- * First step: add a int3 trap to the address that will be patched.
+ * First step: add a INT3 trap to the address that will be patched.
*/
- for (i = 0; i < nr_entries; i++) {
- tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
- text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+ for (i = 0; i < text_poke_array.nr_entries; i++) {
+ text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
}
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
/*
* Second step: update all but the first byte of the patched range.
*/
- for (do_sync = 0, i = 0; i < nr_entries; i++) {
- u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
- u8 _new[POKE_MAX_OPCODE_SIZE+1];
- const u8 *new = tp[i].text;
- int len = tp[i].len;
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
+ u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
+ const u8 *new = text_poke_array.vec[i].text;
+ int len = text_poke_array.vec[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
- text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
if (len == 6) {
@@ -2684,7 +2587,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
new = _new;
}
- text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
@@ -2715,7 +2618,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* The old instruction is recorded so that the event can be
* processed forwards or backwards.
*/
- perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
+ perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
}
if (do_sync) {
@@ -2724,63 +2627,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
}
/*
- * Third step: replace the first byte (int3) by the first byte of
+ * Third step: replace the first byte (INT3) by the first byte of the
* replacing opcode.
*/
- for (do_sync = 0, i = 0; i < nr_entries; i++) {
- u8 byte = tp[i].text[0];
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 byte = text_poke_array.vec[i].text[0];
- if (tp[i].len == 6)
+ if (text_poke_array.vec[i].len == 6)
byte = 0x0f;
if (byte == INT3_INSN_OPCODE)
continue;
- text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
do_sync++;
}
if (do_sync)
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
/*
* Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
*/
- if (!atomic_dec_and_test(&bp_desc.refs))
- atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+ for_each_possible_cpu(i) {
+ atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
+
+ if (unlikely(!atomic_dec_and_test(refs)))
+ atomic_cond_read_acquire(refs, !VAL);
+ }
+
+ /* They are all completed: */
+ text_poke_array.nr_entries = 0;
}
-static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
- const void *opcode, size_t len, const void *emulate)
+static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
+ struct smp_text_poke_loc *tpl;
struct insn insn;
int ret, i = 0;
+ tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
+
if (len == 6)
i = 1;
- memcpy((void *)tp->text, opcode+i, len-i);
+ memcpy((void *)tpl->text, opcode+i, len-i);
if (!emulate)
emulate = opcode;
ret = insn_decode_kernel(&insn, emulate);
BUG_ON(ret < 0);
- tp->rel_addr = addr - (void *)_stext;
- tp->len = len;
- tp->opcode = insn.opcode.bytes[0];
+ tpl->rel_addr = addr - (void *)_stext;
+ tpl->len = len;
+ tpl->opcode = insn.opcode.bytes[0];
if (is_jcc32(&insn)) {
/*
* Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
*/
- tp->opcode = insn.opcode.bytes[1] - 0x10;
+ tpl->opcode = insn.opcode.bytes[1] - 0x10;
}
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case RET_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
@@ -2789,14 +2708,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
* next instruction can be padded with INT3.
*/
for (i = insn.length; i < len; i++)
- BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
+ BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
break;
default:
BUG_ON(len != insn.length);
}
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case INT3_INSN_OPCODE:
case RET_INSN_OPCODE:
break;
@@ -2805,21 +2724,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
case 0x70 ... 0x7f: /* Jcc */
- tp->disp = insn.immediate.value;
+ tpl->disp = insn.immediate.value;
break;
default: /* assume NOP */
switch (len) {
case 2: /* NOP2 -- emulate as JMP8+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
- tp->opcode = JMP8_INSN_OPCODE;
- tp->disp = 0;
+ tpl->opcode = JMP8_INSN_OPCODE;
+ tpl->disp = 0;
break;
case 5: /* NOP5 -- emulate as JMP32+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
- tp->opcode = JMP32_INSN_OPCODE;
- tp->disp = 0;
+ tpl->opcode = JMP32_INSN_OPCODE;
+ tpl->disp = 0;
break;
default: /* unknown instruction */
@@ -2830,51 +2749,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
}
/*
- * We hard rely on the tp_vec being ordered; ensure this is so by flushing
+ * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing
* early if needed.
*/
-static bool tp_order_fail(void *addr)
+static bool text_poke_addr_ordered(void *addr)
{
- struct text_poke_loc *tp;
-
- if (!tp_vec_nr)
- return false;
+ WARN_ON_ONCE(!addr);
- if (!addr) /* force */
+ if (!text_poke_array.nr_entries)
return true;
- tp = &tp_vec[tp_vec_nr - 1];
- if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
- return true;
-
- return false;
-}
-
-static void text_poke_flush(void *addr)
-{
- if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
- text_poke_bp_batch(tp_vec, tp_vec_nr);
- tp_vec_nr = 0;
- }
-}
+ /*
+ * If the last current entry's address is higher than the
+ * new entry's address we'd like to add, then ordering
+ * is violated and we must first flush all pending patching
+ * requests:
+ */
+ if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
+ return false;
-void text_poke_finish(void)
-{
- text_poke_flush(NULL);
+ return true;
}
-void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+/**
+ * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Add a new instruction to the current queue of to-be-patched instructions
+ * the kernel maintains. The patching request will not be executed immediately,
+ * but becomes part of an array of patching requests, optimized for batched
+ * execution. All pending patching requests will be executed on the next
+ * smp_text_poke_batch_finish() call.
+ */
+void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc *tp;
-
- text_poke_flush(addr);
-
- tp = &tp_vec[tp_vec_nr++];
- text_poke_loc_init(tp, addr, opcode, len, emulate);
+ if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
+ smp_text_poke_batch_finish();
+ __smp_text_poke_batch_add(addr, opcode, len, emulate);
}
/**
- * text_poke_bp() -- update instructions on live kernel on SMP
+ * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
@@ -2882,12 +2800,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
*
* Update a single instruction with the vector in the stack, avoiding
* dynamically allocated memory. This function should be used when it is
- * not possible to allocate memory.
+ * not possible to allocate memory for a vector. The single instruction
+ * is patched in immediately.
*/
-void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc tp;
-
- text_poke_loc_init(&tp, addr, opcode, len, emulate);
- text_poke_bp_batch(&tp, 1);
+ __smp_text_poke_batch_add(addr, opcode, len, emulate);
+ smp_text_poke_batch_finish();
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index eebc360ed1bb..ba5a4ccda37a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1486,7 +1486,7 @@ static void __init delay_with_tsc(void)
* 1 GHz == 40 jiffies
*/
do {
- rep_nop();
+ native_pause();
now = rdtsc();
} while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end));
}
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index d86d7d6e750c..a951333c5995 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -185,7 +185,7 @@ static void *patch_dest(void *dest, bool direct)
u8 *pad = dest - tsize;
memcpy(insn_buff, skl_call_thunk_template, tsize);
- apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
+ text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
/* Already patched? */
if (!bcmp(pad, insn_buff, tsize))
@@ -294,7 +294,7 @@ static bool is_callthunk(void *addr)
pad = (void *)(dest - tmpl_size);
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
+ text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
return !bcmp(pad, insn_buff, tmpl_size);
}
@@ -312,7 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
return 0;
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
+ text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
memcpy(*pprog, insn_buff, tmpl_size);
*pprog += tmpl_size;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 606c0cb97e86..246a76ddbf46 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -34,21 +34,63 @@
#include "cpu.h"
+/*
+ * Speculation Vulnerability Handling
+ *
+ * Each vulnerability is handled with the following functions:
+ * <vuln>_select_mitigation() -- Selects a mitigation to use. This should
+ * take into account all relevant command line
+ * options.
+ * <vuln>_update_mitigation() -- This is called after all vulnerabilities have
+ * selected a mitigation, in case the selection
+ * may want to change based on other choices
+ * made. This function is optional.
+ * <vuln>_apply_mitigation() -- Enable the selected mitigation.
+ *
+ * The compile-time mitigation in all cases should be AUTO. An explicit
+ * command-line option can override AUTO. If no such option is
+ * provided, <vuln>_select_mitigation() will override AUTO to the best
+ * mitigation option.
+ */
+
static void __init spectre_v1_select_mitigation(void);
+static void __init spectre_v1_apply_mitigation(void);
static void __init spectre_v2_select_mitigation(void);
+static void __init spectre_v2_update_mitigation(void);
+static void __init spectre_v2_apply_mitigation(void);
static void __init retbleed_select_mitigation(void);
+static void __init retbleed_update_mitigation(void);
+static void __init retbleed_apply_mitigation(void);
static void __init spectre_v2_user_select_mitigation(void);
+static void __init spectre_v2_user_update_mitigation(void);
+static void __init spectre_v2_user_apply_mitigation(void);
static void __init ssb_select_mitigation(void);
+static void __init ssb_apply_mitigation(void);
static void __init l1tf_select_mitigation(void);
+static void __init l1tf_apply_mitigation(void);
static void __init mds_select_mitigation(void);
-static void __init md_clear_update_mitigation(void);
-static void __init md_clear_select_mitigation(void);
+static void __init mds_update_mitigation(void);
+static void __init mds_apply_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init taa_update_mitigation(void);
+static void __init taa_apply_mitigation(void);
static void __init mmio_select_mitigation(void);
+static void __init mmio_update_mitigation(void);
+static void __init mmio_apply_mitigation(void);
+static void __init rfds_select_mitigation(void);
+static void __init rfds_update_mitigation(void);
+static void __init rfds_apply_mitigation(void);
static void __init srbds_select_mitigation(void);
+static void __init srbds_apply_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
static void __init srso_select_mitigation(void);
+static void __init srso_update_mitigation(void);
+static void __init srso_apply_mitigation(void);
static void __init gds_select_mitigation(void);
+static void __init gds_apply_mitigation(void);
+static void __init bhi_select_mitigation(void);
+static void __init bhi_update_mitigation(void);
+static void __init bhi_apply_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
@@ -127,9 +169,13 @@ EXPORT_SYMBOL_GPL(mds_idle_clear);
*/
DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
-/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */
-DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
-EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
+/*
+ * Controls CPU Fill buffer clear before VMenter. This is a subset of
+ * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only
+ * mitigation is required.
+ */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
void __init cpu_select_mitigations(void)
{
@@ -154,30 +200,60 @@ void __init cpu_select_mitigations(void)
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
- /*
- * retbleed_select_mitigation() relies on the state set by
- * spectre_v2_select_mitigation(); specifically it wants to know about
- * spectre_v2=ibrs.
- */
retbleed_select_mitigation();
- /*
- * spectre_v2_user_select_mitigation() relies on the state set by
- * retbleed_select_mitigation(); specifically the STIBP selection is
- * forced for UNRET or IBPB.
- */
spectre_v2_user_select_mitigation();
ssb_select_mitigation();
l1tf_select_mitigation();
- md_clear_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+ rfds_select_mitigation();
srbds_select_mitigation();
l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
+ bhi_select_mitigation();
/*
- * srso_select_mitigation() depends and must run after
- * retbleed_select_mitigation().
+ * After mitigations are selected, some may need to update their
+ * choices.
*/
- srso_select_mitigation();
- gds_select_mitigation();
+ spectre_v2_update_mitigation();
+ /*
+ * retbleed_update_mitigation() relies on the state set by
+ * spectre_v2_update_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_update_mitigation();
+
+ /*
+ * spectre_v2_user_update_mitigation() depends on
+ * retbleed_update_mitigation(), specifically the STIBP
+ * selection is forced for UNRET or IBPB.
+ */
+ spectre_v2_user_update_mitigation();
+ mds_update_mitigation();
+ taa_update_mitigation();
+ mmio_update_mitigation();
+ rfds_update_mitigation();
+ bhi_update_mitigation();
+ /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+ srso_update_mitigation();
+
+ spectre_v1_apply_mitigation();
+ spectre_v2_apply_mitigation();
+ retbleed_apply_mitigation();
+ spectre_v2_user_apply_mitigation();
+ ssb_apply_mitigation();
+ l1tf_apply_mitigation();
+ mds_apply_mitigation();
+ taa_apply_mitigation();
+ mmio_apply_mitigation();
+ rfds_apply_mitigation();
+ srbds_apply_mitigation();
+ srso_apply_mitigation();
+ gds_apply_mitigation();
+ bhi_apply_mitigation();
}
/*
@@ -280,6 +356,12 @@ enum rfds_mitigations {
static enum rfds_mitigations rfds_mitigation __ro_after_init =
IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+/*
+ * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing
+ * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry.
+ */
+static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
+
static void __init mds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
@@ -290,12 +372,34 @@ static void __init mds_select_mitigation(void)
if (mds_mitigation == MDS_MITIGATION_AUTO)
mds_mitigation = MDS_MITIGATION_FULL;
+ if (mds_mitigation == MDS_MITIGATION_OFF)
+ return;
+
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ return;
+
+ /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
if (mds_mitigation == MDS_MITIGATION_FULL) {
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
+ }
- setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ pr_info("%s\n", mds_strings[mds_mitigation]);
+}
+static void __init mds_apply_mitigation(void)
+{
+ if (mds_mitigation == MDS_MITIGATION_FULL ||
+ mds_mitigation == MDS_MITIGATION_VMWERV) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
(mds_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
@@ -335,6 +439,11 @@ static const char * const taa_strings[] = {
[TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
};
+static bool __init taa_vulnerable(void)
+{
+ return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM);
+}
+
static void __init taa_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_TAA)) {
@@ -348,48 +457,63 @@ static void __init taa_select_mitigation(void)
return;
}
- if (cpu_mitigations_off()) {
+ if (cpu_mitigations_off())
taa_mitigation = TAA_MITIGATION_OFF;
- return;
- }
- /*
- * TAA mitigation via VERW is turned off if both
- * tsx_async_abort=off and mds=off are specified.
- */
- if (taa_mitigation == TAA_MITIGATION_OFF &&
- mds_mitigation == MDS_MITIGATION_OFF)
+ /* Microcode will be checked in taa_update_mitigation(). */
+ if (taa_mitigation == TAA_MITIGATION_AUTO)
+ taa_mitigation = TAA_MITIGATION_VERW;
+
+ if (taa_mitigation != TAA_MITIGATION_OFF)
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init taa_update_mitigation(void)
+{
+ if (!taa_vulnerable() || cpu_mitigations_off())
return;
- if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ if (verw_clear_cpu_buf_mitigation_selected)
taa_mitigation = TAA_MITIGATION_VERW;
- else
- taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
- /*
- * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
- * A microcode update fixes this behavior to clear CPU buffers. It also
- * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
- * ARCH_CAP_TSX_CTRL_MSR bit.
- *
- * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
- * update is required.
- */
- if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
- !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
- taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ if (taa_mitigation == TAA_MITIGATION_VERW) {
+ /* Check if the requisite ucode is available. */
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
- /*
- * TSX is enabled, select alternate mitigation for TAA which is
- * the same as MDS. Enable MDS static branch to clear CPU buffers.
- *
- * For guests that can't determine whether the correct microcode is
- * present on host, enable the mitigation for UCODE_NEEDED as well.
- */
- setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ }
- if (taa_nosmt || cpu_mitigations_auto_nosmt())
- cpu_smt_disable(false);
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static void __init taa_apply_mitigation(void)
+{
+ if (taa_mitigation == TAA_MITIGATION_VERW ||
+ taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) {
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+ }
}
static int __init tsx_async_abort_parse_cmdline(char *str)
@@ -432,25 +556,62 @@ static void __init mmio_select_mitigation(void)
return;
}
+ /* Microcode will be checked in mmio_update_mitigation(). */
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
/*
* Enable CPU buffer clear mitigation for host and VMM, if also affected
- * by MDS or TAA. Otherwise, enable mitigation for VMM only.
+ * by MDS or TAA.
*/
- if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
- boot_cpu_has(X86_FEATURE_RTM)))
- setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mmio_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+
+ if (mmio_mitigation == MMIO_MITIGATION_VERW) {
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))))
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init mmio_apply_mitigation(void)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
/*
- * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based
- * mitigations, disable KVM-only mitigation in that case.
+ * Only enable the VMM mitigation if the CPU buffer clear mitigation is
+ * not being used.
*/
- if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
- static_branch_disable(&mmio_stale_data_clear);
- else
- static_branch_enable(&mmio_stale_data_clear);
+ if (verw_clear_cpu_buf_mitigation_selected) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ static_branch_disable(&cpu_buf_vm_clear);
+ } else {
+ static_branch_enable(&cpu_buf_vm_clear);
+ }
/*
* If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
@@ -460,21 +621,6 @@ static void __init mmio_select_mitigation(void)
if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
static_branch_enable(&mds_idle_clear);
- /*
- * Check if the system has the right microcode.
- *
- * CPU Fill buffer clear mitigation is enumerated by either an explicit
- * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
- * affected systems.
- */
- if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
- (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
- boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
- !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))
- mmio_mitigation = MMIO_MITIGATION_VERW;
- else
- mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
-
if (mmio_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
}
@@ -509,22 +655,48 @@ static const char * const rfds_strings[] = {
[RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
};
+static inline bool __init verw_clears_cpu_reg_file(void)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR);
+}
+
static void __init rfds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
rfds_mitigation = RFDS_MITIGATION_OFF;
return;
}
+
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
- if (rfds_mitigation == RFDS_MITIGATION_AUTO)
+ if (verw_clears_cpu_reg_file())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init rfds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
rfds_mitigation = RFDS_MITIGATION_VERW;
- if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ if (!verw_clears_cpu_reg_file())
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static void __init rfds_apply_mitigation(void)
+{
+ if (rfds_mitigation == RFDS_MITIGATION_VERW)
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
- else
- rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
}
static __init int rfds_parse_cmdline(char *str)
@@ -545,74 +717,11 @@ static __init int rfds_parse_cmdline(char *str)
early_param("reg_file_data_sampling", rfds_parse_cmdline);
#undef pr_fmt
-#define pr_fmt(fmt) "" fmt
-
-static void __init md_clear_update_mitigation(void)
-{
- if (cpu_mitigations_off())
- return;
-
- if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
- goto out;
-
- /*
- * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO
- * Stale Data mitigation, if necessary.
- */
- if (mds_mitigation == MDS_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MDS)) {
- mds_mitigation = MDS_MITIGATION_FULL;
- mds_select_mitigation();
- }
- if (taa_mitigation == TAA_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_TAA)) {
- taa_mitigation = TAA_MITIGATION_VERW;
- taa_select_mitigation();
- }
- /*
- * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear
- * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state.
- */
- if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
- mmio_mitigation = MMIO_MITIGATION_VERW;
- mmio_select_mitigation();
- }
- if (rfds_mitigation == RFDS_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_RFDS)) {
- rfds_mitigation = RFDS_MITIGATION_VERW;
- rfds_select_mitigation();
- }
-out:
- if (boot_cpu_has_bug(X86_BUG_MDS))
- pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
- if (boot_cpu_has_bug(X86_BUG_TAA))
- pr_info("TAA: %s\n", taa_strings[taa_mitigation]);
- if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
- pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
- if (boot_cpu_has_bug(X86_BUG_RFDS))
- pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]);
-}
-
-static void __init md_clear_select_mitigation(void)
-{
- mds_select_mitigation();
- taa_select_mitigation();
- mmio_select_mitigation();
- rfds_select_mitigation();
-
- /*
- * As these mitigations are inter-related and rely on VERW instruction
- * to clear the microarchitural buffers, update and print their status
- * after mitigation selection is done for each of these vulnerabilities.
- */
- md_clear_update_mitigation();
-}
-
-#undef pr_fmt
#define pr_fmt(fmt) "SRBDS: " fmt
enum srbds_mitigations {
SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_AUTO,
SRBDS_MITIGATION_UCODE_NEEDED,
SRBDS_MITIGATION_FULL,
SRBDS_MITIGATION_TSX_OFF,
@@ -620,7 +729,7 @@ enum srbds_mitigations {
};
static enum srbds_mitigations srbds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF;
static const char * const srbds_strings[] = {
[SRBDS_MITIGATION_OFF] = "Vulnerable",
@@ -671,8 +780,13 @@ void update_srbds_msr(void)
static void __init srbds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
return;
+ }
+
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO)
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
/*
* Check to see if this is one of the MDS_NO systems supporting TSX that
@@ -686,13 +800,17 @@ static void __init srbds_select_mitigation(void)
srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
- else if (cpu_mitigations_off() || srbds_off)
+ else if (srbds_off)
srbds_mitigation = SRBDS_MITIGATION_OFF;
- update_srbds_msr();
pr_info("%s\n", srbds_strings[srbds_mitigation]);
}
+static void __init srbds_apply_mitigation(void)
+{
+ update_srbds_msr();
+}
+
static int __init srbds_parse_cmdline(char *str)
{
if (!str)
@@ -739,6 +857,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline);
enum gds_mitigations {
GDS_MITIGATION_OFF,
+ GDS_MITIGATION_AUTO,
GDS_MITIGATION_UCODE_NEEDED,
GDS_MITIGATION_FORCE,
GDS_MITIGATION_FULL,
@@ -747,7 +866,7 @@ enum gds_mitigations {
};
static enum gds_mitigations gds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF;
static const char * const gds_strings[] = {
[GDS_MITIGATION_OFF] = "Vulnerable",
@@ -788,6 +907,7 @@ void update_gds_msr(void)
case GDS_MITIGATION_FORCE:
case GDS_MITIGATION_UCODE_NEEDED:
case GDS_MITIGATION_HYPERVISOR:
+ case GDS_MITIGATION_AUTO:
return;
}
@@ -811,26 +931,21 @@ static void __init gds_select_mitigation(void)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
gds_mitigation = GDS_MITIGATION_HYPERVISOR;
- goto out;
+ return;
}
if (cpu_mitigations_off())
gds_mitigation = GDS_MITIGATION_OFF;
/* Will verify below that mitigation _can_ be disabled */
+ if (gds_mitigation == GDS_MITIGATION_AUTO)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
/* No microcode */
if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
- if (gds_mitigation == GDS_MITIGATION_FORCE) {
- /*
- * This only needs to be done on the boot CPU so do it
- * here rather than in update_gds_msr()
- */
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
- } else {
+ if (gds_mitigation != GDS_MITIGATION_FORCE)
gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
- }
- goto out;
+ return;
}
/* Microcode has mitigation, use it */
@@ -851,9 +966,25 @@ static void __init gds_select_mitigation(void)
*/
gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
}
+}
+
+static void __init gds_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ /* Microcode is present */
+ if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)
+ update_gds_msr();
+ else if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+ * This only needs to be done on the boot CPU so do it
+ * here rather than in update_gds_msr()
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ }
- update_gds_msr();
-out:
pr_info("%s\n", gds_strings[gds_mitigation]);
}
@@ -914,10 +1045,14 @@ static bool smap_works_speculatively(void)
static void __init spectre_v1_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+}
+
+static void __init spectre_v1_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
return;
- }
if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
/*
@@ -972,6 +1107,7 @@ enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
enum retbleed_mitigation {
RETBLEED_MITIGATION_NONE,
+ RETBLEED_MITIGATION_AUTO,
RETBLEED_MITIGATION_UNRET,
RETBLEED_MITIGATION_IBPB,
RETBLEED_MITIGATION_IBRS,
@@ -979,14 +1115,6 @@ enum retbleed_mitigation {
RETBLEED_MITIGATION_STUFF,
};
-enum retbleed_mitigation_cmd {
- RETBLEED_CMD_OFF,
- RETBLEED_CMD_AUTO,
- RETBLEED_CMD_UNRET,
- RETBLEED_CMD_IBPB,
- RETBLEED_CMD_STUFF,
-};
-
static const char * const retbleed_strings[] = {
[RETBLEED_MITIGATION_NONE] = "Vulnerable",
[RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk",
@@ -997,9 +1125,7 @@ static const char * const retbleed_strings[] = {
};
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
- RETBLEED_MITIGATION_NONE;
-static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE;
static int __ro_after_init retbleed_nosmt = false;
@@ -1016,15 +1142,15 @@ static int __init retbleed_parse_cmdline(char *str)
}
if (!strcmp(str, "off")) {
- retbleed_cmd = RETBLEED_CMD_OFF;
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
} else if (!strcmp(str, "auto")) {
- retbleed_cmd = RETBLEED_CMD_AUTO;
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
} else if (!strcmp(str, "unret")) {
- retbleed_cmd = RETBLEED_CMD_UNRET;
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
} else if (!strcmp(str, "ibpb")) {
- retbleed_cmd = RETBLEED_CMD_IBPB;
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
} else if (!strcmp(str, "stuff")) {
- retbleed_cmd = RETBLEED_CMD_STUFF;
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
} else if (!strcmp(str, "nosmt")) {
retbleed_nosmt = true;
} else if (!strcmp(str, "force")) {
@@ -1045,72 +1171,109 @@ early_param("retbleed", retbleed_parse_cmdline);
static void __init retbleed_select_mitigation(void)
{
- bool mitigate_smt = false;
-
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
- return;
-
- switch (retbleed_cmd) {
- case RETBLEED_CMD_OFF:
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
return;
+ }
- case RETBLEED_CMD_UNRET:
- if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
- retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
- } else {
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_UNRET:
+ if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n");
- goto do_cmd_auto;
}
break;
-
- case RETBLEED_CMD_IBPB:
+ case RETBLEED_MITIGATION_IBPB:
if (!boot_cpu_has(X86_FEATURE_IBPB)) {
pr_err("WARNING: CPU does not support IBPB.\n");
- goto do_cmd_auto;
- } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
- retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
- } else {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
- goto do_cmd_auto;
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
}
break;
+ case RETBLEED_MITIGATION_STUFF:
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ }
+ break;
+ default:
+ break;
+ }
- case RETBLEED_CMD_STUFF:
- if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) &&
- spectre_v2_enabled == SPECTRE_V2_RETPOLINE) {
- retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+ if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
+ return;
- } else {
- if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))
- pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
- else
- pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
+ /* Intel mitigation selected in retbleed_update_mitigation() */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
+ boot_cpu_has(X86_FEATURE_IBPB))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+}
- goto do_cmd_auto;
- }
- break;
+static void __init retbleed_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
+ return;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_NONE)
+ goto out;
-do_cmd_auto:
- case RETBLEED_CMD_AUTO:
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
- retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
- else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
- boot_cpu_has(X86_FEATURE_IBPB))
- retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ /*
+ * retbleed=stuff is only allowed on Intel. If stuffing can't be used
+ * then a different mitigation will be selected below.
+ */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF) {
+ if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) {
+ pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
}
+ }
+ /*
+ * Let IBRS trump all on Intel without affecting the effects of the
+ * retbleed= cmdline option except for call depth based stuffing
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_IBRS:
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ break;
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ break;
+ default:
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ pr_err(RETBLEED_INTEL_MSG);
+ }
+ /* If nothing has set the mitigation yet, default to NONE. */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO)
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+out:
+ pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+}
- /*
- * The Intel mitigation (IBRS or eIBRS) was already selected in
- * spectre_v2_select_mitigation(). 'retbleed_mitigation' will
- * be set accordingly below.
- */
- break;
- }
+static void __init retbleed_apply_mitigation(void)
+{
+ bool mitigate_smt = false;
switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_NONE:
+ return;
+
case RETBLEED_MITIGATION_UNRET:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
@@ -1160,28 +1323,6 @@ do_cmd_auto:
if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
(retbleed_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
-
- /*
- * Let IBRS trump all on Intel without affecting the effects of the
- * retbleed= cmdline option except for call depth based stuffing
- */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
- switch (spectre_v2_enabled) {
- case SPECTRE_V2_IBRS:
- retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
- break;
- case SPECTRE_V2_EIBRS:
- case SPECTRE_V2_EIBRS_RETPOLINE:
- case SPECTRE_V2_EIBRS_LFENCE:
- retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
- break;
- default:
- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
- pr_err(RETBLEED_INTEL_MSG);
- }
- }
-
- pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
}
#undef pr_fmt
@@ -1261,6 +1402,8 @@ enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_IBRS,
};
+static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO;
+
enum spectre_v2_user_cmd {
SPECTRE_V2_USER_CMD_NONE,
SPECTRE_V2_USER_CMD_AUTO,
@@ -1299,31 +1442,18 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure)
pr_info("spectre_v2_user=%s forced on command line.\n", reason);
}
-static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
-
-static enum spectre_v2_user_cmd __init
-spectre_v2_parse_user_cmdline(void)
+static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void)
{
- enum spectre_v2_user_cmd mode;
char arg[20];
int ret, i;
- mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ?
- SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
-
- switch (spectre_v2_cmd) {
- case SPECTRE_V2_CMD_NONE:
+ if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
return SPECTRE_V2_USER_CMD_NONE;
- case SPECTRE_V2_CMD_FORCE:
- return SPECTRE_V2_USER_CMD_FORCE;
- default:
- break;
- }
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
arg, sizeof(arg));
if (ret < 0)
- return mode;
+ return SPECTRE_V2_USER_CMD_AUTO;
for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
if (match_option(arg, ret, v2_user_options[i].option)) {
@@ -1334,7 +1464,7 @@ spectre_v2_parse_user_cmdline(void)
}
pr_err("Unknown user space protection option (%s). Switching to default\n", arg);
- return mode;
+ return SPECTRE_V2_USER_CMD_AUTO;
}
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
@@ -1342,60 +1472,72 @@ static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
}
-static void __init
-spectre_v2_user_select_mitigation(void)
+static void __init spectre_v2_user_select_mitigation(void)
{
- enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
- enum spectre_v2_user_cmd cmd;
-
if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
return;
- cmd = spectre_v2_parse_user_cmdline();
- switch (cmd) {
+ switch (spectre_v2_parse_user_cmdline()) {
case SPECTRE_V2_USER_CMD_NONE:
- goto set_mode;
+ return;
case SPECTRE_V2_USER_CMD_FORCE:
- mode = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_PRCTL:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
- mode = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
break;
case SPECTRE_V2_USER_CMD_SECCOMP:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = spectre_v2_user_ibpb;
+ break;
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
if (IS_ENABLED(CONFIG_SECCOMP))
- mode = SPECTRE_V2_USER_SECCOMP;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP;
else
- mode = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
break;
}
- /* Initialize Indirect Branch Prediction Barrier */
- if (boot_cpu_has(X86_FEATURE_IBPB)) {
- static_branch_enable(&switch_vcpu_ibpb);
+ /*
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
- spectre_v2_user_ibpb = mode;
- switch (cmd) {
- case SPECTRE_V2_USER_CMD_NONE:
- break;
- case SPECTRE_V2_USER_CMD_FORCE:
- case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
- case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
- static_branch_enable(&switch_mm_always_ibpb);
- spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
- break;
- case SPECTRE_V2_USER_CMD_PRCTL:
- case SPECTRE_V2_USER_CMD_AUTO:
- case SPECTRE_V2_USER_CMD_SECCOMP:
- static_branch_enable(&switch_mm_cond_ibpb);
- break;
- }
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
- pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
- static_key_enabled(&switch_mm_always_ibpb) ?
- "always-on" : "conditional");
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+}
+
+static void __init spectre_v2_user_update_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ /* The spectre_v2 cmd line can override spectre_v2_user options */
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
}
/*
@@ -1413,30 +1555,44 @@ spectre_v2_user_select_mitigation(void)
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
!cpu_smt_possible() ||
(spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
- !boot_cpu_has(X86_FEATURE_AUTOIBRS)))
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))) {
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
return;
+ }
- /*
- * At this point, an STIBP mode other than "off" has been set.
- * If STIBP support is not being forced, check if STIBP always-on
- * is preferred.
- */
- if (mode != SPECTRE_V2_USER_STRICT &&
- boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
- mode = SPECTRE_V2_USER_STRICT_PREFERRED;
-
- if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
- retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
- if (mode != SPECTRE_V2_USER_STRICT &&
- mode != SPECTRE_V2_USER_STRICT_PREFERRED)
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE &&
+ (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) {
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT &&
+ spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED)
pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
- mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
}
+ pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]);
+}
+
+static void __init spectre_v2_user_apply_mitigation(void)
+{
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) {
+ static_branch_enable(&switch_vcpu_ibpb);
- spectre_v2_user_stibp = mode;
+ switch (spectre_v2_user_ibpb) {
+ case SPECTRE_V2_USER_STRICT:
+ static_branch_enable(&switch_mm_always_ibpb);
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
-set_mode:
- pr_info("%s\n", spectre_v2_user_strings[mode]);
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
}
static const char * const spectre_v2_strings[] = {
@@ -1656,12 +1812,13 @@ static bool __init spec_ctrl_bhi_dis(void)
enum bhi_mitigations {
BHI_MITIGATION_OFF,
+ BHI_MITIGATION_AUTO,
BHI_MITIGATION_ON,
BHI_MITIGATION_VMEXIT_ONLY,
};
static enum bhi_mitigations bhi_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
static int __init spectre_bhi_parse_cmdline(char *str)
{
@@ -1683,6 +1840,25 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline);
static void __init bhi_select_mitigation(void)
{
+ if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off())
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (bhi_mitigation == BHI_MITIGATION_AUTO)
+ bhi_mitigation = BHI_MITIGATION_ON;
+}
+
+static void __init bhi_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
+ spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+}
+
+static void __init bhi_apply_mitigation(void)
+{
if (bhi_mitigation == BHI_MITIGATION_OFF)
return;
@@ -1714,75 +1890,80 @@ static void __init bhi_select_mitigation(void)
static void __init spectre_v2_select_mitigation(void)
{
- enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
- enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
+ spectre_v2_cmd = spectre_v2_parse_cmdline();
- /*
- * If the CPU is not affected and the command line mode is NONE or AUTO
- * then nothing to do.
- */
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
- (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
+ (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO))
return;
- switch (cmd) {
+ switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return;
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- mode = SPECTRE_V2_EIBRS;
- break;
- }
-
- if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
- boot_cpu_has_bug(X86_BUG_RETBLEED) &&
- retbleed_cmd != RETBLEED_CMD_OFF &&
- retbleed_cmd != RETBLEED_CMD_STUFF &&
- boot_cpu_has(X86_FEATURE_IBRS) &&
- boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
- mode = SPECTRE_V2_IBRS;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
}
- mode = spectre_v2_select_retpoline();
+ spectre_v2_enabled = spectre_v2_select_retpoline();
break;
case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
pr_err(SPECTRE_V2_LFENCE_MSG);
- mode = SPECTRE_V2_LFENCE;
+ spectre_v2_enabled = SPECTRE_V2_LFENCE;
break;
case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
- mode = SPECTRE_V2_RETPOLINE;
+ spectre_v2_enabled = SPECTRE_V2_RETPOLINE;
break;
case SPECTRE_V2_CMD_RETPOLINE:
- mode = spectre_v2_select_retpoline();
+ spectre_v2_enabled = spectre_v2_select_retpoline();
break;
case SPECTRE_V2_CMD_IBRS:
- mode = SPECTRE_V2_IBRS;
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
break;
case SPECTRE_V2_CMD_EIBRS:
- mode = SPECTRE_V2_EIBRS;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
case SPECTRE_V2_CMD_EIBRS_LFENCE:
- mode = SPECTRE_V2_EIBRS_LFENCE;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE;
break;
case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
- mode = SPECTRE_V2_EIBRS_RETPOLINE;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE;
break;
}
+}
+
+static void __init spectre_v2_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
+ boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ retbleed_mitigation != RETBLEED_MITIGATION_NONE &&
+ retbleed_mitigation != RETBLEED_MITIGATION_STUFF &&
+ boot_cpu_has(X86_FEATURE_IBRS) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ }
+ }
- if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off())
+ pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
+}
+
+static void __init spectre_v2_apply_mitigation(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
- if (spectre_v2_in_ibrs_mode(mode)) {
+ if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) {
msr_set_bit(MSR_EFER, _EFER_AUTOIBRS);
} else {
@@ -1791,8 +1972,10 @@ static void __init spectre_v2_select_mitigation(void)
}
}
- switch (mode) {
+ switch (spectre_v2_enabled) {
case SPECTRE_V2_NONE:
+ return;
+
case SPECTRE_V2_EIBRS:
break;
@@ -1818,18 +2001,12 @@ static void __init spectre_v2_select_mitigation(void)
* JMPs gets protection against BHI and Intramode-BTI, but RET
* prediction from a non-RSB predictor is still a risk.
*/
- if (mode == SPECTRE_V2_EIBRS_LFENCE ||
- mode == SPECTRE_V2_EIBRS_RETPOLINE ||
- mode == SPECTRE_V2_RETPOLINE)
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE ||
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE ||
+ spectre_v2_enabled == SPECTRE_V2_RETPOLINE)
spec_ctrl_disable_kernel_rrsba();
- if (boot_cpu_has(X86_BUG_BHI))
- bhi_select_mitigation();
-
- spectre_v2_enabled = mode;
- pr_info("%s\n", spectre_v2_strings[mode]);
-
- spectre_v2_select_rsb_mitigation(mode);
+ spectre_v2_select_rsb_mitigation(spectre_v2_enabled);
/*
* Retpoline protects the kernel, but doesn't protect firmware. IBRS
@@ -1837,28 +2014,26 @@ static void __init spectre_v2_select_mitigation(void)
* firmware calls only when IBRS / Enhanced / Automatic IBRS aren't
* otherwise enabled.
*
- * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
- * the user might select retpoline on the kernel command line and if
- * the CPU supports Enhanced IBRS, kernel might un-intentionally not
- * enable IBRS around firmware calls.
+ * Use "spectre_v2_enabled" to check Enhanced IBRS instead of
+ * boot_cpu_has(), because the user might select retpoline on the kernel
+ * command line and if the CPU supports Enhanced IBRS, kernel might
+ * un-intentionally not enable IBRS around firmware calls.
*/
if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
boot_cpu_has(X86_FEATURE_IBPB) &&
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) {
- if (retbleed_cmd != RETBLEED_CMD_IBPB) {
+ if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW);
pr_info("Enabling Speculation Barrier for firmware calls\n");
}
- } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) {
+ } else if (boot_cpu_has(X86_FEATURE_IBRS) &&
+ !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
-
- /* Set up IBPB and STIBP depending on the general spectre V2 command */
- spectre_v2_cmd = cmd;
}
static void update_stibp_msr(void * __unused)
@@ -2047,19 +2222,18 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
return cmd;
}
-static enum ssb_mitigation __init __ssb_select_mitigation(void)
+static void __init ssb_select_mitigation(void)
{
- enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
enum ssb_mitigation_cmd cmd;
if (!boot_cpu_has(X86_FEATURE_SSBD))
- return mode;
+ goto out;
cmd = ssb_parse_cmdline();
if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
(cmd == SPEC_STORE_BYPASS_CMD_NONE ||
cmd == SPEC_STORE_BYPASS_CMD_AUTO))
- return mode;
+ return;
switch (cmd) {
case SPEC_STORE_BYPASS_CMD_SECCOMP:
@@ -2068,28 +2242,35 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
* enabled.
*/
if (IS_ENABLED(CONFIG_SECCOMP))
- mode = SPEC_STORE_BYPASS_SECCOMP;
+ ssb_mode = SPEC_STORE_BYPASS_SECCOMP;
else
- mode = SPEC_STORE_BYPASS_PRCTL;
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
break;
case SPEC_STORE_BYPASS_CMD_ON:
- mode = SPEC_STORE_BYPASS_DISABLE;
+ ssb_mode = SPEC_STORE_BYPASS_DISABLE;
break;
case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_PRCTL:
- mode = SPEC_STORE_BYPASS_PRCTL;
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
break;
case SPEC_STORE_BYPASS_CMD_NONE:
break;
}
+out:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+static void __init ssb_apply_mitigation(void)
+{
/*
* We have three CPU feature flags that are in play here:
* - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
* - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
* - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
*/
- if (mode == SPEC_STORE_BYPASS_DISABLE) {
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) {
setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
/*
* Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
@@ -2103,16 +2284,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
update_spec_ctrl(x86_spec_ctrl_base);
}
}
-
- return mode;
-}
-
-static void ssb_select_mitigation(void)
-{
- ssb_mode = __ssb_select_mitigation();
-
- if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
- pr_info("%s\n", ssb_strings[ssb_mode]);
}
#undef pr_fmt
@@ -2368,7 +2539,7 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
/* Default mitigation for L1TF-affected CPUs */
enum l1tf_mitigations l1tf_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
#if IS_ENABLED(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(l1tf_mitigation);
#endif
@@ -2416,22 +2587,33 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
static void __init l1tf_select_mitigation(void)
{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (l1tf_mitigation == L1TF_MITIGATION_AUTO) {
+ if (cpu_mitigations_auto_nosmt())
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ }
+}
+
+static void __init l1tf_apply_mitigation(void)
+{
u64 half_pa;
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return;
- if (cpu_mitigations_off())
- l1tf_mitigation = L1TF_MITIGATION_OFF;
- else if (cpu_mitigations_auto_nosmt())
- l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
-
override_cache_bits(&boot_cpu_data);
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_AUTO:
break;
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
@@ -2491,6 +2673,7 @@ early_param("l1tf", l1tf_cmdline);
enum srso_mitigation {
SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
SRSO_MITIGATION_UCODE_NEEDED,
SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
SRSO_MITIGATION_MICROCODE,
@@ -2500,14 +2683,6 @@ enum srso_mitigation {
SRSO_MITIGATION_BP_SPEC_REDUCE,
};
-enum srso_mitigation_cmd {
- SRSO_CMD_OFF,
- SRSO_CMD_MICROCODE,
- SRSO_CMD_SAFE_RET,
- SRSO_CMD_IBPB,
- SRSO_CMD_IBPB_ON_VMEXIT,
-};
-
static const char * const srso_strings[] = {
[SRSO_MITIGATION_NONE] = "Vulnerable",
[SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
@@ -2519,8 +2694,7 @@ static const char * const srso_strings[] = {
[SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
-static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
-static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET;
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
static int __init srso_parse_cmdline(char *str)
{
@@ -2528,15 +2702,15 @@ static int __init srso_parse_cmdline(char *str)
return -EINVAL;
if (!strcmp(str, "off"))
- srso_cmd = SRSO_CMD_OFF;
+ srso_mitigation = SRSO_MITIGATION_NONE;
else if (!strcmp(str, "microcode"))
- srso_cmd = SRSO_CMD_MICROCODE;
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
else if (!strcmp(str, "safe-ret"))
- srso_cmd = SRSO_CMD_SAFE_RET;
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
else if (!strcmp(str, "ibpb"))
- srso_cmd = SRSO_CMD_IBPB;
+ srso_mitigation = SRSO_MITIGATION_IBPB;
else if (!strcmp(str, "ibpb-vmexit"))
- srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT;
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
else
pr_err("Ignoring unknown SRSO option (%s).", str);
@@ -2548,132 +2722,83 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
+ bool has_microcode;
- if (!boot_cpu_has_bug(X86_BUG_SRSO) ||
- cpu_mitigations_off() ||
- srso_cmd == SRSO_CMD_OFF) {
- if (boot_cpu_has(X86_FEATURE_SBPB))
- x86_pred_cmd = PRED_CMD_SBPB;
- goto out;
- }
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+ srso_mitigation = SRSO_MITIGATION_NONE;
+
+ if (srso_mitigation == SRSO_MITIGATION_NONE)
+ return;
+ if (srso_mitigation == SRSO_MITIGATION_AUTO)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+
+ has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
if (has_microcode) {
/*
* Zen1/2 with SMT off aren't vulnerable after the right
* IBPB microcode has been applied.
- *
- * Zen1/2 don't have SBPB, no need to try to enable it here.
*/
if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
- goto out;
- }
-
- if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
- srso_mitigation = SRSO_MITIGATION_IBPB;
- goto out;
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ return;
}
} else {
pr_warn("IBPB-extending microcode not applied!\n");
pr_warn(SRSO_NOTICE);
-
- /* may be overwritten by SRSO_CMD_SAFE_RET below */
- srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
}
- switch (srso_cmd) {
- case SRSO_CMD_MICROCODE:
- if (has_microcode) {
- srso_mitigation = SRSO_MITIGATION_MICROCODE;
- pr_warn(SRSO_NOTICE);
- }
- break;
-
- case SRSO_CMD_SAFE_RET:
- if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
goto ibpb_on_vmexit;
+ }
- if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
- /*
- * Enable the return thunk for generated code
- * like ftrace, static_call, etc.
- */
- setup_force_cpu_cap(X86_FEATURE_RETHUNK);
- setup_force_cpu_cap(X86_FEATURE_UNRET);
-
- if (boot_cpu_data.x86 == 0x19) {
- setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
- x86_return_thunk = srso_alias_return_thunk;
- } else {
- setup_force_cpu_cap(X86_FEATURE_SRSO);
- x86_return_thunk = srso_return_thunk;
- }
- if (has_microcode)
- srso_mitigation = SRSO_MITIGATION_SAFE_RET;
- else
- srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
- } else {
+ if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
}
- break;
- case SRSO_CMD_IBPB:
- if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
- if (has_microcode) {
- setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
- setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
- srso_mitigation = SRSO_MITIGATION_IBPB;
-
- /*
- * IBPB on entry already obviates the need for
- * software-based untraining so clear those in case some
- * other mitigation like Retbleed has selected them.
- */
- setup_clear_cpu_cap(X86_FEATURE_UNRET);
- setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
-
- /*
- * There is no need for RSB filling: write_ibpb() ensures
- * all predictions, including the RSB, are invalidated,
- * regardless of IBPB implementation.
- */
- setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
- }
- } else {
- pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
- }
+ if (!has_microcode)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
break;
-
ibpb_on_vmexit:
- case SRSO_CMD_IBPB_ON_VMEXIT:
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
break;
}
-
- if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
- if (has_microcode) {
- setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
- srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
-
- /*
- * There is no need for RSB filling: write_ibpb() ensures
- * all predictions, including the RSB, are invalidated,
- * regardless of IBPB implementation.
- */
- setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
- }
- } else {
+ fallthrough;
+ case SRSO_MITIGATION_IBPB:
+ if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
}
+
+ if (!has_microcode)
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
break;
default:
break;
}
+}
-out:
+static void __init srso_update_mitigation(void)
+{
+ /* If retbleed is using IBPB, that works for SRSO as well */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB &&
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ if (boot_cpu_has_bug(X86_BUG_SRSO) && !cpu_mitigations_off())
+ pr_info("%s\n", srso_strings[srso_mitigation]);
+}
+
+static void __init srso_apply_mitigation(void)
+{
/*
* Clear the feature flag if this mitigation is not selected as that
* feature flag controls the BpSpecReduce MSR bit toggling in KVM.
@@ -2681,8 +2806,52 @@ out:
if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
- if (srso_mitigation != SRSO_MITIGATION_NONE)
- pr_info("%s\n", srso_strings[srso_mitigation]);
+ if (srso_mitigation == SRSO_MITIGATION_NONE) {
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ /*
+ * Enable the return thunk for generated code
+ * like ftrace, static_call, etc.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ if (boot_cpu_data.x86 == 0x19) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+ x86_return_thunk = srso_alias_return_thunk;
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_SRSO);
+ x86_return_thunk = srso_return_thunk;
+ }
+ break;
+ case SRSO_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like Retbleed has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+ fallthrough;
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+ default:
+ break;
+ }
}
#undef pr_fmt
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e5734df3b4a1..8a1f9a889eb4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -242,6 +242,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+SYM_PIC_ALIAS(gdt_page);
#ifdef CONFIG_X86_64
static int __init x86_nopcid_setup(char *s)
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 4a10d35e70aa..96cb992d50ef 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -1098,15 +1098,17 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
static int __init save_microcode_in_initrd(void)
{
- unsigned int cpuid_1_eax = native_cpuid_eax(1);
struct cpuinfo_x86 *c = &boot_cpu_data;
struct cont_desc desc = { 0 };
+ unsigned int cpuid_1_eax;
enum ucode_state ret;
struct cpio_data cp;
- if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
return 0;
+ cpuid_1_eax = native_cpuid_eax(1);
+
if (!find_blobs_in_containers(&cp))
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b3658d11e7b6..079f046ee26d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -41,8 +41,8 @@
#include "internal.h"
-static struct microcode_ops *microcode_ops;
-bool dis_ucode_ldr = true;
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr = false;
bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
@@ -84,6 +84,9 @@ static bool amd_check_current_patch_level(void)
u32 lvl, dummy, i;
u32 *levels;
+ if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+ return false;
+
native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
levels = final_levels;
@@ -95,27 +98,29 @@ static bool amd_check_current_patch_level(void)
return false;
}
-static bool __init check_loader_disabled_bsp(void)
+bool __init microcode_loader_disabled(void)
{
- static const char *__dis_opt_str = "dis_ucode_ldr";
- const char *cmdline = boot_command_line;
- const char *option = __dis_opt_str;
+ if (dis_ucode_ldr)
+ return true;
/*
- * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
- * completely accurate as xen pv guests don't see that CPUID bit set but
- * that's good enough as they don't land on the BSP path anyway.
+ * Disable when:
+ *
+ * 1) The CPU does not support CPUID.
+ *
+ * 2) Bit 31 in CPUID[1]:ECX is clear
+ * The bit is reserved for hypervisor use. This is still not
+ * completely accurate as XEN PV guests don't see that CPUID bit
+ * set, but that's good enough as they don't land on the BSP
+ * path anyway.
+ *
+ * 3) Certain AMD patch levels are not allowed to be
+ * overwritten.
*/
- if (native_cpuid_ecx(1) & BIT(31))
- return true;
-
- if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
- if (amd_check_current_patch_level())
- return true;
- }
-
- if (cmdline_find_option_bool(cmdline, option) <= 0)
- dis_ucode_ldr = false;
+ if (!have_cpuid_p() ||
+ native_cpuid_ecx(1) & BIT(31) ||
+ amd_check_current_patch_level())
+ dis_ucode_ldr = true;
return dis_ucode_ldr;
}
@@ -125,7 +130,10 @@ void __init load_ucode_bsp(void)
unsigned int cpuid_1_eax;
bool intel = true;
- if (!have_cpuid_p())
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+
+ if (microcode_loader_disabled())
return;
cpuid_1_eax = native_cpuid_eax(1);
@@ -146,9 +154,6 @@ void __init load_ucode_bsp(void)
return;
}
- if (check_loader_disabled_bsp())
- return;
-
if (intel)
load_ucode_intel_bsp(&early_data);
else
@@ -159,6 +164,11 @@ void load_ucode_ap(void)
{
unsigned int cpuid_1_eax;
+ /*
+ * Can't use microcode_loader_disabled() here - .init section
+ * hell. It doesn't have to either - the BSP variant must've
+ * parsed cmdline already anyway.
+ */
if (dis_ucode_ldr)
return;
@@ -810,7 +820,7 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
- if (dis_ucode_ldr)
+ if (microcode_loader_disabled())
return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 819199bc0119..2a397da43923 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -389,7 +389,7 @@ static int __init save_builtin_microcode(void)
if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
return 0;
- if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
uci.mc = get_microcode_blob(&uci, true);
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index 5df621752fef..50a9702ae4e2 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -94,7 +94,6 @@ static inline unsigned int x86_cpuid_family(void)
return x86_family(eax);
}
-extern bool dis_ucode_ldr;
extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 3aad78bfcb26..cba75306e5b6 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/console.h>
#include <linux/kernel.h>
+#include <linux/kexec.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/screen_info.h>
@@ -144,6 +145,11 @@ static __init void early_serial_hw_init(unsigned divisor)
static_call(serial_out)(early_serial_base, DLL, divisor & 0xff);
static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff);
static_call(serial_out)(early_serial_base, LCR, c & ~DLAB);
+
+#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64)
+ if (static_call_query(serial_in) == io_serial_in)
+ kexec_debug_8250_port = early_serial_base;
+#endif
}
#define DEFAULT_BAUD 9600
@@ -327,6 +333,9 @@ static __init void early_pci_serial_init(char *s)
/* WARNING! assuming the address is always in the first 4G */
early_serial_base =
(unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10);
+#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64)
+ kexec_debug_8250_mmio32 = bar0 & PCI_BASE_ADDRESS_MEM_MASK;
+#endif
write_pci_config(bus, slot, func, PCI_COMMAND,
cmdreg|PCI_COMMAND_MEMORY);
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cace6e8d7cc7..0853ba3fd04a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -55,10 +55,10 @@ void ftrace_arch_code_modify_post_process(void)
{
/*
* ftrace_make_{call,nop}() may be called during
- * module load, and we need to finish the text_poke_queue()
+ * module load, and we need to finish the smp_text_poke_batch_add()
* that they do, here.
*/
- text_poke_finish();
+ smp_text_poke_batch_finish();
ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
}
@@ -119,7 +119,7 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code,
/* replace the text with the new text */
if (ftrace_poke_late)
- text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
else
text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
@@ -186,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
ip = (unsigned long)(&ftrace_call);
new = ftrace_call_replace(ip, (unsigned long)func);
- text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
ip = (unsigned long)(&ftrace_regs_call);
new = ftrace_call_replace(ip, (unsigned long)func);
- text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
@@ -247,10 +247,10 @@ void ftrace_replace_code(int enable)
break;
}
- text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
ftrace_update_record(rec, enable);
}
- text_poke_finish();
+ smp_text_poke_batch_finish();
}
void arch_ftrace_update_code(int command)
@@ -492,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
mutex_lock(&text_mutex);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
- text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
mutex_unlock(&text_mutex);
}
@@ -586,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
const char *new;
new = ftrace_jmp_replace(ip, (unsigned long)func);
- text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index de001b2146ab..375f2d7f1762 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void)
*ptr = (unsigned long)ptep + PAGE_OFFSET;
#ifdef CONFIG_MICROCODE_INITRD32
- /* Running on a hypervisor? */
- if (native_cpuid_ecx(1) & BIT(31))
- return;
-
params = (struct boot_params *)__pa_nodebug(&boot_params);
if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
return;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index fa9b6339975f..510fb41f55fc 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -47,7 +47,8 @@
* Manage page tables very early on.
*/
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt;
+unsigned int __initdata next_early_pgt;
+SYM_PIC_ALIAS(next_early_pgt);
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
#ifdef CONFIG_X86_5LEVEL
@@ -61,221 +62,15 @@ EXPORT_SYMBOL(ptrs_per_p4d);
#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
EXPORT_SYMBOL(page_offset_base);
+SYM_PIC_ALIAS(page_offset_base);
unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
EXPORT_SYMBOL(vmalloc_base);
+SYM_PIC_ALIAS(vmalloc_base);
unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base);
+SYM_PIC_ALIAS(vmemmap_base);
#endif
-static inline bool check_la57_support(void)
-{
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
- return false;
-
- /*
- * 5-level paging is detected and enabled at kernel decompression
- * stage. Only check if it has been enabled there.
- */
- if (!(native_read_cr4() & X86_CR4_LA57))
- return false;
-
- RIP_REL_REF(__pgtable_l5_enabled) = 1;
- RIP_REL_REF(pgdir_shift) = 48;
- RIP_REL_REF(ptrs_per_p4d) = 512;
- RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5;
- RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5;
- RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5;
-
- return true;
-}
-
-static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
- pmdval_t *pmd,
- unsigned long p2v_offset)
-{
- unsigned long paddr, paddr_end;
- int i;
-
- /* Encrypt the kernel and related (if SME is active) */
- sme_encrypt_kernel(bp);
-
- /*
- * Clear the memory encryption mask from the .bss..decrypted section.
- * The bss section will be memset to zero later in the initialization so
- * there is no need to zero it after changing the memory encryption
- * attribute.
- */
- if (sme_get_me_mask()) {
- paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted);
- paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted);
-
- for (; paddr < paddr_end; paddr += PMD_SIZE) {
- /*
- * On SNP, transition the page to shared in the RMP table so that
- * it is consistent with the page table attribute change.
- *
- * __start_bss_decrypted has a virtual address in the high range
- * mapping (kernel .text). PVALIDATE, by way of
- * early_snp_set_memory_shared(), requires a valid virtual
- * address but the kernel is currently running off of the identity
- * mapping so use the PA to get a *currently* valid virtual address.
- */
- early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
-
- i = pmd_index(paddr - p2v_offset);
- pmd[i] -= sme_get_me_mask();
- }
- }
-
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
-}
-
-/* Code in __startup_64() can be relocated during execution, but the compiler
- * doesn't have to generate PC-relative relocations when accessing globals from
- * that function. Clang actually does not generate them, which leads to
- * boot-time crashes. To work around this problem, every global pointer must
- * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
- * by subtracting p2v_offset from the RIP-relative address.
- */
-unsigned long __head __startup_64(unsigned long p2v_offset,
- struct boot_params *bp)
-{
- pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
- unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
- unsigned long va_text, va_end;
- unsigned long pgtable_flags;
- unsigned long load_delta;
- pgdval_t *pgd;
- p4dval_t *p4d;
- pudval_t *pud;
- pmdval_t *pmd, pmd_entry;
- bool la57;
- int i;
-
- la57 = check_la57_support();
-
- /* Is the address too large? */
- if (physaddr >> MAX_PHYSMEM_BITS)
- for (;;);
-
- /*
- * Compute the delta between the address I am compiled to run at
- * and the address I am actually running at.
- */
- load_delta = __START_KERNEL_map + p2v_offset;
- RIP_REL_REF(phys_base) = load_delta;
-
- /* Is the address not 2M aligned? */
- if (load_delta & ~PMD_MASK)
- for (;;);
-
- va_text = physaddr - p2v_offset;
- va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset;
-
- /* Include the SME encryption mask in the fixup value */
- load_delta += sme_get_me_mask();
-
- /* Fixup the physical addresses in the page table */
-
- pgd = &RIP_REL_REF(early_top_pgt)->pgd;
- pgd[pgd_index(__START_KERNEL_map)] += load_delta;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
- p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
- p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
-
- pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
- }
-
- RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
- RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;
-
- for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
- RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;
-
- /*
- * Set up the identity mapping for the switchover. These
- * entries should *NOT* have the global bit set! This also
- * creates a bunch of nonsense entries but that is fine --
- * it avoids problems around wraparound.
- */
-
- pud = &early_pgts[0]->pmd;
- pmd = &early_pgts[1]->pmd;
- RIP_REL_REF(next_early_pgt) = 2;
-
- pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
-
- if (la57) {
- p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;
-
- i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
- pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
-
- i = physaddr >> P4D_SHIFT;
- p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
- p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
- } else {
- i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
- pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
- }
-
- i = physaddr >> PUD_SHIFT;
- pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
- pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
-
- pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
- /* Filter out unsupported __PAGE_KERNEL_* bits: */
- pmd_entry &= RIP_REL_REF(__supported_pte_mask);
- pmd_entry += sme_get_me_mask();
- pmd_entry += physaddr;
-
- for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
- int idx = i + (physaddr >> PMD_SHIFT);
-
- pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
- }
-
- /*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
- *
- * Only the region occupied by the kernel image has so far
- * been checked against the table of usable memory regions
- * provided by the firmware, so invalidate pages outside that
- * region. A page table entry that maps to a reserved area of
- * memory would allow processor speculation into that area,
- * and on some hardware (particularly the UV platform) even
- * speculative access to some reserved areas is caught as an
- * error, causing the BIOS to halt the system.
- */
-
- pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
-
- /* invalidate pages before the kernel image */
- for (i = 0; i < pmd_index(va_text); i++)
- pmd[i] &= ~_PAGE_PRESENT;
-
- /* fixup pages that are part of the kernel image */
- for (; i <= pmd_index(va_end); i++)
- if (pmd[i] & _PAGE_PRESENT)
- pmd[i] += load_delta;
-
- /* invalidate pages after the kernel image */
- for (; i < PTRS_PER_PMD; i++)
- pmd[i] &= ~_PAGE_PRESENT;
-
- return sme_postprocess_startup(bp, pmd, p2v_offset);
-}
-
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
@@ -513,41 +308,6 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data)
start_kernel();
}
-/*
- * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is
- * used until the idt_table takes over. On the boot CPU this happens in
- * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases
- * this happens in the functions called from head_64.S.
- *
- * The idt_table can't be used that early because all the code modifying it is
- * in idt.c and can be instrumented by tracing or KASAN, which both don't work
- * during early CPU bringup. Also the idt_table has the runtime vectors
- * configured which require certain CPU state to be setup already (like TSS),
- * which also hasn't happened yet in early CPU bringup.
- */
-static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
-
-/* This may run while still in the direct mapping */
-static void __head startup_64_load_idt(void *vc_handler)
-{
- struct desc_ptr desc = {
- .address = (unsigned long)&RIP_REL_REF(bringup_idt_table),
- .size = sizeof(bringup_idt_table) - 1,
- };
- struct idt_data data;
- gate_desc idt_desc;
-
- /* @vc_handler is set only for a VMM Communication Exception */
- if (vc_handler) {
- init_idt_data(&data, X86_TRAP_VC, vc_handler);
- idt_init_desc(&idt_desc, &data);
- native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc);
- }
-
- native_load_idt(&desc);
-}
-
-/* This is used when running on kernel addresses */
void early_setup_idt(void)
{
void *handler = NULL;
@@ -559,30 +319,3 @@ void early_setup_idt(void)
startup_64_load_idt(handler);
}
-
-/*
- * Setup boot CPU state needed before kernel switches to virtual addresses.
- */
-void __head startup_64_setup_gdt_idt(void)
-{
- struct desc_struct *gdt = (void *)(__force unsigned long)gdt_page.gdt;
- void *handler = NULL;
-
- struct desc_ptr startup_gdt_descr = {
- .address = (unsigned long)&RIP_REL_REF(*gdt),
- .size = GDT_SIZE - 1,
- };
-
- /* Load GDT */
- native_load_gdt(&startup_gdt_descr);
-
- /* New GDT is live - reload data segment registers */
- asm volatile("movl %%eax, %%ds\n"
- "movl %%eax, %%ss\n"
- "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
- handler = &RIP_REL_REF(vc_no_ghcb);
-
- startup_64_load_idt(handler);
-}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 2e42056d2306..76743dfad6ab 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -86,7 +86,7 @@ SYM_CODE_START(startup_32)
movl $pa(__bss_stop),%ecx
subl %edi,%ecx
shrl $2,%ecx
- rep ; stosl
+ rep stosl
/*
* Copy bootup parameters out of the way.
* Note: %esi still has the pointer to the real-mode data.
@@ -98,15 +98,13 @@ SYM_CODE_START(startup_32)
movl $pa(boot_params),%edi
movl $(PARAM_SIZE/4),%ecx
cld
- rep
- movsl
+ rep movsl
movl pa(boot_params) + NEW_CL_POINTER,%esi
andl %esi,%esi
jz 1f # No command line
movl $pa(boot_command_line),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
- rep
- movsl
+ rep movsl
1:
#ifdef CONFIG_OLPC
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fefe2a25cf02..069420853304 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -573,6 +573,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
/* Pure iret required here - don't use INTERRUPT_RETURN */
iretq
SYM_CODE_END(vc_no_ghcb)
+SYM_PIC_ALIAS(vc_no_ghcb);
#endif
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
@@ -604,10 +605,12 @@ SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
SYM_DATA_END(early_top_pgt)
+SYM_PIC_ALIAS(early_top_pgt)
SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
SYM_DATA_END(early_dynamic_pgts)
+SYM_PIC_ALIAS(early_dynamic_pgts);
SYM_DATA(early_recursion_flag, .long 0)
@@ -651,6 +654,7 @@ SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
SYM_DATA_END(level4_kernel_pgt)
+SYM_PIC_ALIAS(level4_kernel_pgt)
#endif
SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
@@ -659,6 +663,7 @@ SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
SYM_DATA_END(level3_kernel_pgt)
+SYM_PIC_ALIAS(level3_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
/*
@@ -676,6 +681,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
*/
PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
SYM_DATA_END(level2_kernel_pgt)
+SYM_PIC_ALIAS(level2_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
.fill (512 - 4 - FIXMAP_PMD_NUM),8,0
@@ -688,6 +694,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
/* 6 MB reserved space + a 2MB hole */
.fill 4,8,0
SYM_DATA_END(level2_fixmap_pgt)
+SYM_PIC_ALIAS(level2_fixmap_pgt)
SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
.rept (FIXMAP_PMD_NUM)
@@ -703,6 +710,7 @@ SYM_DATA(smpboot_control, .long 0)
.align 16
/* This must match the first entry in level2_kernel_pgt */
SYM_DATA(phys_base, .quad 0x0)
+SYM_PIC_ALIAS(phys_base);
EXPORT_SYMBOL(phys_base)
#include "../xen/xen-head.S"
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index f5b8ef02d172..a7949a54a0ff 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry,
return;
}
- text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
+ smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
}
static void __ref jump_label_transform(struct jump_entry *entry,
@@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
mutex_lock(&text_mutex);
jlp = __jump_label_patch(entry, type);
- text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
+ smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
mutex_unlock(&text_mutex);
return true;
}
@@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
void arch_jump_label_transform_apply(void)
{
mutex_lock(&text_mutex);
- text_poke_finish();
+ smp_text_poke_batch_finish();
mutex_unlock(&text_mutex);
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 09608fd93687..47cb8eb138ba 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -808,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p)
u8 int3 = INT3_INSN_OPCODE;
text_poke(p->addr, &int3, 1);
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
@@ -818,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p)
perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
}
void arch_remove_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 36d6809c6c9e..0aabd4c4e2c4 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist)
insn_buff[0] = JMP32_INSN_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
- text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
+ smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
list_del_init(&op->list);
}
@@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op)
JMP32_INSN_SIZE - INT3_INSN_SIZE);
text_poke(addr, new, INT3_INSN_SIZE);
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
text_poke(addr + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
JMP32_INSN_SIZE - INT3_INSN_SIZE);
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index a68f5a0a9f37..949c9e4bfad2 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -76,6 +76,19 @@ map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
#endif
+static int map_mmio_serial(struct x86_mapping_info *info, pgd_t *level4p)
+{
+ unsigned long mstart, mend;
+
+ if (!kexec_debug_8250_mmio32)
+ return 0;
+
+ mstart = kexec_debug_8250_mmio32 & PAGE_MASK;
+ mend = (kexec_debug_8250_mmio32 + PAGE_SIZE + 23) & PAGE_MASK;
+ pr_info("Map PCI serial at %lx - %lx\n", mstart, mend);
+ return kernel_ident_mapping_init(info, level4p, mstart, mend);
+}
+
#ifdef CONFIG_KEXEC_FILE
const struct kexec_file_ops * const kexec_file_loaders[] = {
&kexec_bzImage64_ops,
@@ -285,6 +298,10 @@ static int init_pgtable(struct kimage *image, unsigned long control_page)
if (result)
return result;
+ result = map_mmio_serial(&info, image->arch.pgd);
+ if (result)
+ return result;
+
/*
* This must be last because the intermediate page table pages it
* allocates will not be control pages and may overlap the image.
@@ -304,6 +321,24 @@ static void load_segments(void)
);
}
+static void prepare_debug_idt(unsigned long control_page, unsigned long vec_ofs)
+{
+ gate_desc idtentry = { 0 };
+ int i;
+
+ idtentry.bits.p = 1;
+ idtentry.bits.type = GATE_TRAP;
+ idtentry.segment = __KERNEL_CS;
+ idtentry.offset_low = (control_page & 0xFFFF) + vec_ofs;
+ idtentry.offset_middle = (control_page >> 16) & 0xFFFF;
+ idtentry.offset_high = control_page >> 32;
+
+ for (i = 0; i < 16; i++) {
+ kexec_debug_idt[i] = idtentry;
+ idtentry.offset_low += KEXEC_DEBUG_EXC_HANDLER_SIZE;
+ }
+}
+
int machine_kexec_prepare(struct kimage *image)
{
void *control_page = page_address(image->control_code_page);
@@ -321,6 +356,9 @@ int machine_kexec_prepare(struct kimage *image)
if (image->type == KEXEC_TYPE_DEFAULT)
kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT;
+ prepare_debug_idt((unsigned long)__pa(control_page),
+ (unsigned long)kexec_debug_exc_vectors - reloc_start);
+
__memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
set_memory_rox((unsigned long)control_page, 1);
@@ -396,16 +434,10 @@ void __nocfi machine_kexec(struct kimage *image)
* with from a table in memory. At no other time is the
* descriptor table in memory accessed.
*
- * I take advantage of this here by force loading the
- * segments, before I zap the gdt with an invalid value.
+ * Take advantage of this here by force loading the segments,
+ * before the GDT is zapped with an invalid value.
*/
load_segments();
- /*
- * The gdt & idt are now invalid.
- * If you want to load them you must set up your own idt & gdt.
- */
- native_idt_invalidate();
- native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index a7998f351701..231d6326d1fd 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -206,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs,
write, apply);
if (!early) {
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
mutex_unlock(&text_mutex);
}
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c7c4b1917336..57276f134d12 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -263,17 +263,17 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movl %edx, %edi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
movl %ebp, %edi
movl %eax, %esi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
movl %eax, %edi
movl %edx, %esi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
lea PAGE_SIZE(%ebp), %esi
jmp 0b
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index ac058971a382..ea604f4d0b52 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -39,6 +39,8 @@ SYM_DATA(kexec_va_control_page, .quad 0)
SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
+SYM_DATA(kexec_debug_8250_port, .word 0)
.balign 16
SYM_DATA_START_LOCAL(kexec_debug_gdt)
@@ -50,6 +52,11 @@ SYM_DATA_START_LOCAL(kexec_debug_gdt)
.quad 0x00cf92000000ffff /* __KERNEL_DS */
SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
+ .balign 8
+SYM_DATA_START(kexec_debug_idt)
+ .skip 0x100, 0x00
+SYM_DATA_END(kexec_debug_idt)
+
.section .text..relocate_kernel,"ax";
.code64
SYM_CODE_START_NOALIGN(relocate_kernel)
@@ -72,8 +79,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
pushq %r15
pushf
- /* zero out flags, and disable interrupts */
- pushq $0
+ /* Invalidate GDT/IDT, zero out flags */
+ pushq $0
+ pushq $0
+
+ lidt (%rsp)
+ lgdt (%rsp)
+ addq $8, %rsp
popfq
/* Switch to the identity mapped page tables */
@@ -139,6 +151,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %ds, %rax
movq %rax, %ds
+ /* Now an IDTR on the stack to load the IDT the kernel created */
+ leaq kexec_debug_idt(%rip), %rsi
+ pushq %rsi
+ pushw $0xff
+ lidt (%rsp)
+ addq $10, %rsp
+
+ //int3
+
/*
* Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
* below.
@@ -342,20 +363,20 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
/* copy source page to swap page */
movq kexec_pa_swap_page(%rip), %rdi
movl $512, %ecx
- rep ; movsq
+ rep movsq
/* copy destination page to source page */
movq %rax, %rdi
movq %rdx, %rsi
movl $512, %ecx
- rep ; movsq
+ rep movsq
/* copy swap page to destination page */
movq %rdx, %rdi
movq kexec_pa_swap_page(%rip), %rsi
.Lnoswap:
movl $512, %ecx
- rep ; movsq
+ rep movsq
lea PAGE_SIZE(%rax), %rsi
jmp .Lloop
@@ -364,3 +385,222 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
ret
int3
SYM_CODE_END(swap_pages)
+
+/*
+ * Generic 'print character' routine
+ * - %al: Character to be printed (may clobber %rax)
+ * - %rdx: MMIO address or port.
+ */
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ addw $LSR, %dx
+ xchg %al, %ah
+.Lxmtrdy_loop:
+ inb %dx, %al
+ testb $XMTRDY, %al
+ jnz .Lready
+ pause
+ jmp .Lxmtrdy_loop
+
+.Lready:
+ subw $LSR, %dx
+ xchg %al, %ah
+ outb %al, %dx
+pr_char_null:
+ ANNOTATE_NOENDBR
+
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+.Lxmtrdy_loop_mmio:
+ movb (LSR*4)(%rdx), %ah
+ testb $XMTRDY, %ah
+ jnz .Lready_mmio
+ pause
+ jmp .Lxmtrdy_loop_mmio
+
+.Lready_mmio:
+ movb %al, (%rdx)
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250_mmio32)
+
+/*
+ * Load pr_char function pointer into %rsi and load %rdx with whatever
+ * that function wants to see there (typically port/MMIO address).
+ */
+.macro pr_setup
+ leaq pr_char_8250(%rip), %rsi
+ movw kexec_debug_8250_port(%rip), %dx
+ testw %dx, %dx
+ jnz 1f
+
+ leaq pr_char_8250_mmio32(%rip), %rsi
+ movq kexec_debug_8250_mmio32(%rip), %rdx
+ testq %rdx, %rdx
+ jnz 1f
+
+ leaq pr_char_null(%rip), %rsi
+1:
+.endm
+
+/* Print the nybble in %bl, clobber %rax */
+SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
+ UNWIND_HINT_FUNC
+ movb %bl, %al
+ nop
+ andb $0x0f, %al
+ addb $0x30, %al
+ cmpb $0x3a, %al
+ jb 1f
+ addb $('a' - '0' - 10), %al
+ ANNOTATE_RETPOLINE_SAFE
+1: jmp *%rsi
+SYM_CODE_END(pr_nybble)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
+ UNWIND_HINT_FUNC
+ movq $16, %rcx
+1: rolq $4, %rbx
+ call pr_nybble
+ loop 1b
+ movb $'\n', %al
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
+SYM_CODE_END(pr_qword)
+
+.macro print_reg a, b, c, d, r
+ movb $\a, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\b, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\c, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\d, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movq \r, %rbx
+ call pr_qword
+.endm
+
+SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
+ /* Each of these is 6 bytes. */
+.macro vec_err exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ nop
+ nop
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+.macro vec_noerr exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ pushq $0
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+ ANNOTATE_NOENDBR
+ vec_noerr 0 // #DE
+ vec_noerr 1 // #DB
+ vec_noerr 2 // #NMI
+ vec_noerr 3 // #BP
+ vec_noerr 4 // #OF
+ vec_noerr 5 // #BR
+ vec_noerr 6 // #UD
+ vec_noerr 7 // #NM
+ vec_err 8 // #DF
+ vec_noerr 9
+ vec_err 10 // #TS
+ vec_err 11 // #NP
+ vec_err 12 // #SS
+ vec_err 13 // #GP
+ vec_err 14 // #PF
+ vec_noerr 15
+SYM_CODE_END(kexec_debug_exc_vectors)
+
+SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
+ /* No need for RET mitigations during kexec */
+ VALIDATE_UNRET_END
+
+ pushq %rax
+ pushq %rbx
+ pushq %rcx
+ pushq %rdx
+ pushq %rsi
+
+ /* Stack frame */
+#define EXC_SS 0x58 /* Architectural... */
+#define EXC_RSP 0x50
+#define EXC_EFLAGS 0x48
+#define EXC_CS 0x40
+#define EXC_RIP 0x38
+#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */
+#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */
+#define EXC_RAX 0x20 /* Pushed just above in exc_handler */
+#define EXC_RBX 0x18
+#define EXC_RCX 0x10
+#define EXC_RDX 0x08
+#define EXC_RSI 0x00
+
+ /* Set up %rdx/%rsi for debug output */
+ pr_setup
+
+ /* rip and exception info */
+ print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
+ print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
+ print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
+ print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)
+
+ /* We spilled these to the stack */
+ print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
+ print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
+ print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
+ print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
+ print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)
+
+ /* Other registers untouched */
+ print_reg 'r', 'd', 'i', ':', %rdi
+ print_reg 'r', '8', ' ', ':', %r8
+ print_reg 'r', '9', ' ', ':', %r9
+ print_reg 'r', '1', '0', ':', %r10
+ print_reg 'r', '1', '1', ':', %r11
+ print_reg 'r', '1', '2', ':', %r12
+ print_reg 'r', '1', '3', ':', %r13
+ print_reg 'r', '1', '4', ':', %r14
+ print_reg 'r', '1', '5', ':', %r15
+ print_reg 'c', 'r', '2', ':', %cr2
+
+ /* Only return from INT3 */
+ cmpq $3, EXC_EXCEPTION(%rsp)
+ jne .Ldie
+
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ popq %rax
+
+ addq $16, %rsp
+ iretq
+
+.Ldie:
+ hlt
+ jmp .Ldie
+
+SYM_CODE_END(exc_handler)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9d2a13b37833..e0cf1595a0ab 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -134,6 +134,7 @@ struct ist_info ist_info;
struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
+SYM_PIC_ALIAS(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
__visible unsigned long mmu_cr4_features __ro_after_init;
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index a59c72e77645..8164a7323c17 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
if (system_state == SYSTEM_BOOTING || modinit)
return text_poke_early(insn, code, size);
- text_poke_bp(insn, code, size, emulate);
+ smp_text_poke_single(insn, code, size, emulate);
}
static void __static_call_validate(u8 *insn, bool tail, bool tramp)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9f88b8a78e50..d67407c623f3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -882,16 +882,16 @@ static void do_int3_user(struct pt_regs *regs)
DEFINE_IDTENTRY_RAW(exc_int3)
{
/*
- * poke_int3_handler() is completely self contained code; it does (and
+ * smp_text_poke_int3_handler() is completely self contained code; it does (and
* must) *NOT* call out to anything, lest it hits upon yet another
* INT3.
*/
- if (poke_int3_handler(regs))
+ if (smp_text_poke_int3_handler(regs))
return;
/*
* irqentry_enter_from_user_mode() uses static_branch_{,un}likely()
- * and therefore can trigger INT3, hence poke_int3_handler() must
+ * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must
* be done before. If the entry came from kernel mode, then use
* nmi_enter() because the INT3 could have been hit in any context
* including NMI.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ccdc45e5b759..d813f64a89d6 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -79,11 +79,13 @@ const_cpu_current_top_of_stack = cpu_current_top_of_stack;
#define BSS_DECRYPTED \
. = ALIGN(PMD_SIZE); \
__start_bss_decrypted = .; \
+ __pi___start_bss_decrypted = .; \
*(.bss..decrypted); \
. = ALIGN(PAGE_SIZE); \
__start_bss_decrypted_unused = .; \
. = ALIGN(PMD_SIZE); \
__end_bss_decrypted = .; \
+ __pi___end_bss_decrypted = .; \
#else
@@ -128,6 +130,7 @@ SECTIONS
/* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
+ __pi__text = .;
_stext = .;
ALIGN_ENTRY_TEXT_BEGIN
*(.text..__x86.rethunk_untrain)
@@ -391,6 +394,7 @@ SECTIONS
. = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
_end = .;
+ __pi__end = .;
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
@@ -466,10 +470,18 @@ SECTIONS
}
/*
- * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause
+ * this. Let's assume that nobody will be running a COMPILE_TEST kernel and
+ * let's assert that fuller build coverage is more valuable than being able to
+ * run a COMPILE_TEST kernel.
+ */
+#ifndef CONFIG_COMPILE_TEST
+/*
+ * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility:
*/
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
+#endif
/* needed for Clang - see arch/x86/entry/entry.S */
PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 050a0e229a4d..f2b36d32ef40 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
+
/*
* Checking root.hpa is sufficient even when KVM has mirror root.
* We can have either:
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 63bb77ee1bb1..8d1b632e33d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots);
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
int *bytes)
@@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range)
{
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
/*
* Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
* supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
@@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;
+ if (WARN_ON_ONCE(range->end <= range->start))
+ return false;
+
+ /*
+ * If the head and tail pages of the range currently allow a hugepage,
+ * i.e. reside fully in the slot and don't have mixed attributes, then
+ * add each corresponding hugepage range to the ongoing invalidation,
+ * e.g. to prevent KVM from creating a hugepage in response to a fault
+ * for a gfn whose attributes aren't changing. Note, only the range
+ * of gfns whose attributes are being modified needs to be explicitly
+ * unmapped, as that will unmap any existing hugepages.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t start = gfn_round_for_level(range->start, level);
+ gfn_t end = gfn_round_for_level(range->end - 1, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+
+ if ((start != range->start || start + nr_pages > range->end) &&
+ start >= slot->base_gfn &&
+ start + nr_pages <= slot->base_gfn + slot->npages &&
+ !hugepage_test_mixed(slot, start, level))
+ kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages);
+
+ if (end == start)
+ continue;
+
+ if ((end + nr_pages) > range->end &&
+ (end + nr_pages) <= (slot->base_gfn + slot->npages) &&
+ !hugepage_test_mixed(slot, end, level))
+ kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages);
+ }
+
/* Unmap the old attribute page. */
if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
range->attr_filter = KVM_FILTER_SHARED;
@@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
return kvm_unmap_gfn_range(kvm, range);
}
-static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
-}
-
-static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
-}
-static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
-}
static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, int level, unsigned long attrs)
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index 699e551ec93b..9864c057187d 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
kvm_mmu_reset_context(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_smm_changed);
void process_smi(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0bc708ee2788..a7a7dc507336 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3173,9 +3173,14 @@ skip_vmsa_free:
kvfree(svm->sev_es.ghcb_sa);
}
+static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
+{
+ return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
static void dump_ghcb(struct vcpu_svm *svm)
{
- struct ghcb *ghcb = svm->sev_es.ghcb;
+ struct vmcb_control_area *control = &svm->vmcb->control;
unsigned int nbits;
/* Re-use the dump_invalid_vmcb module parameter */
@@ -3184,18 +3189,24 @@ static void dump_ghcb(struct vcpu_svm *svm)
return;
}
- nbits = sizeof(ghcb->save.valid_bitmap) * 8;
+ nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
- pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
+ /*
+ * Print KVM's snapshot of the GHCB values that were (unsuccessfully)
+ * used to handle the exit. If the guest has since modified the GHCB
+ * itself, dumping the raw GHCB won't help debug why KVM was unable to
+ * handle the VMGEXIT that KVM observed.
+ */
+ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
- ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
+ kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
- ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
+ control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
- ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
+ control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
- ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
- pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
+ svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
+ pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
}
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
@@ -3266,11 +3277,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
-{
- return (((u64)control->exit_code_hi) << 32) | control->exit_code;
-}
-
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d5d0c5c3300b..a89c271a1951 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
kvm_cpu_svm_disable();
amd_pmu_disable_virt();
-
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
static int svm_enable_virtualization_cpu(void)
@@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
-
return 0;
}
@@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_SPINLOCK(srso_lock);
+static atomic_t srso_nr_vms;
+
+static void svm_srso_clear_bp_spec_reduce(void *ign)
+{
+ struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
+
+ if (!sd->bp_spec_reduce_set)
+ return;
+
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ sd->bp_spec_reduce_set = false;
+}
+
+static void svm_srso_vm_destroy(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ if (atomic_dec_return(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ /*
+ * Verify a new VM didn't come along, acquire the lock, and increment
+ * the count before this task acquired the lock.
+ */
+ if (atomic_read(&srso_nr_vms))
+ return;
+
+ on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
+}
+
+static void svm_srso_vm_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ /*
+ * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
+ * transition, i.e. destroying the last VM, is fully complete, e.g. so
+ * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
+ */
+ if (atomic_inc_not_zero(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ atomic_inc(&srso_nr_vms);
+}
+#else
+static void svm_srso_vm_init(void) { }
+static void svm_srso_vm_destroy(void) { }
+#endif
+
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
+ !sd->bp_spec_reduce_set) {
+ sd->bp_spec_reduce_set = true;
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ }
svm->guest_state_loaded = true;
}
@@ -2231,6 +2287,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
*/
if (!sev_es_guest(vcpu->kvm)) {
clear_page(svm->vmcb);
+#ifdef CONFIG_KVM_SMM
+ if (is_smm(vcpu))
+ kvm_smm_changed(vcpu, false);
+#endif
kvm_vcpu_reset(vcpu, true);
}
@@ -5036,6 +5096,8 @@ static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
sev_vm_destroy(kvm);
+
+ svm_srso_vm_destroy();
}
static int svm_vm_init(struct kvm *kvm)
@@ -5061,6 +5123,7 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ svm_srso_vm_init();
return 0;
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d4490eaed55d..f16b068c4228 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -335,6 +335,8 @@ struct svm_cpu_data {
u32 next_asid;
u32 min_asid;
+ bool bp_spec_reduce_set;
+
struct vmcb *save_area;
unsigned long save_area_pa;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5c5766467a61..0aba4719ae0a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -273,6 +273,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER;
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
@@ -7358,10 +7359,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
* mitigation for MDS is done late in VMentry and is still
* executed in spite of L1D Flush. This is because an extra VERW
* should not matter much after the big hammer L1D Flush.
+ *
+ * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA,
+ * and is affected by MMIO Stale Data. In such cases mitigation in only
+ * needed against an MMIO capable guest.
*/
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
kvm_arch_has_assigned_device(vcpu->kvm))
mds_clear_cpu_buffers();
@@ -7700,6 +7705,7 @@ int vmx_vm_init(struct kvm *kvm)
case L1TF_MITIGATION_FLUSH_NOWARN:
/* 'I explicitly don't care' is set */
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index df5b99ea1f18..9896fd574bfc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4597,7 +4597,7 @@ static bool kvm_is_vm_type_supported(unsigned long type)
return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}
-static inline u32 kvm_sync_valid_fields(struct kvm *kvm)
+static inline u64 kvm_sync_valid_fields(struct kvm *kvm)
{
return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
}
@@ -11493,7 +11493,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct kvm_run *kvm_run = vcpu->run;
- u32 sync_valid_fields;
+ u64 sync_valid_fields;
int r;
r = kvm_mmu_post_init_vm(vcpu->kvm);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index e86eda2c0b04..eb2d2e1cbddd 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -75,7 +75,7 @@ static void delay_tsc(u64 cycles)
/* Allow RT tasks to run */
preempt_enable();
- rep_nop();
+ native_pause();
preempt_disable();
/*
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 98631c0e7a11..f786401ac15d 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -631,14 +631,21 @@ static bool get_desc(struct desc_struct *out, unsigned short sel)
/* Bits [15:3] contain the index of the desired entry. */
sel >>= 3;
- mutex_lock(&current->active_mm->context.lock);
- ldt = current->active_mm->context.ldt;
+ /*
+ * If we're not in a valid context with a real (not just lazy)
+ * user mm, then don't even try.
+ */
+ if (!nmi_uaccess_okay())
+ return false;
+
+ mutex_lock(&current->mm->context.lock);
+ ldt = current->mm->context.ldt;
if (ldt && sel < ldt->nr_entries) {
*out = ldt->entries[sel];
success = true;
}
- mutex_unlock(&current->active_mm->context.lock);
+ mutex_unlock(&current->mm->context.lock);
return success;
}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 6ffb931b9fb1..149a57e334ab 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -324,6 +324,11 @@ int insn_get_opcode(struct insn *insn)
}
insn->attr = inat_get_opcode_attribute(op);
+ if (insn->x86_64 && inat_is_invalid64(insn->attr)) {
+ /* This instruction is invalid, like UD2. Stop decoding. */
+ insn->attr &= INAT_INV64;
+ }
+
while (inat_is_escape(insn->attr)) {
/* Get escaped opcode */
op = get_next(insn_byte_t, insn);
@@ -337,6 +342,7 @@ int insn_get_opcode(struct insn *insn)
insn->attr = 0;
return -EINVAL;
}
+
end:
opcode->got = 1;
return 0;
@@ -658,7 +664,6 @@ int insn_get_immediate(struct insn *insn)
}
if (!inat_has_immediate(insn->attr))
- /* no immediates */
goto done;
switch (inat_immediate_size(insn->attr)) {
diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c
index 5eecb45d05d5..c20e04764edc 100644
--- a/arch/x86/lib/iomem.c
+++ b/arch/x86/lib/iomem.c
@@ -10,7 +10,7 @@
static __always_inline void rep_movs(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
- asm volatile("rep ; movsl\n\t"
+ asm volatile("rep movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 0ae2e1712e2e..12a23fa7c44c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -41,6 +41,7 @@ SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
+SYM_PIC_ALIAS(memcpy)
EXPORT_SYMBOL(memcpy)
SYM_FUNC_START_LOCAL(memcpy_orig)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index d66b710d628f..fb5a03cf5ab7 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -42,6 +42,7 @@ SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)
SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
+SYM_PIC_ALIAS(memset)
EXPORT_SYMBOL(memset)
SYM_FUNC_START_LOCAL(memset_orig)
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index a26c43abd47d..9f3116609c8c 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -40,6 +40,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
ALTERNATIVE_2 __stringify(RETPOLINE \reg), \
__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE, \
__stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), ALT_NOT(X86_FEATURE_RETPOLINE)
+SYM_PIC_ALIAS(__x86_indirect_thunk_\reg)
.endm
@@ -394,6 +395,7 @@ SYM_CODE_START(__x86_return_thunk)
#endif
int3
SYM_CODE_END(__x86_return_thunk)
+SYM_PIC_ALIAS(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)
#endif /* CONFIG_MITIGATION_RETHUNK */
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 53b3f202267c..f87ec24fa579 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -40,8 +40,7 @@ char *strncpy(char *dest, const char *src, size_t count)
"stosb\n\t"
"testb %%al,%%al\n\t"
"jne 1b\n\t"
- "rep\n\t"
- "stosb\n"
+ "rep stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
: "0" (src), "1" (dest), "2" (count) : "memory");
@@ -54,8 +53,7 @@ EXPORT_SYMBOL(strncpy);
char *strcat(char *dest, const char *src)
{
int d0, d1, d2, d3;
- asm volatile("repne\n\t"
- "scasb\n\t"
+ asm volatile("repne scasb\n\t"
"decl %1\n"
"1:\tlodsb\n\t"
"stosb\n\t"
@@ -72,8 +70,7 @@ EXPORT_SYMBOL(strcat);
char *strncat(char *dest, const char *src, size_t count)
{
int d0, d1, d2, d3;
- asm volatile("repne\n\t"
- "scasb\n\t"
+ asm volatile("repne scasb\n\t"
"decl %1\n\t"
"movl %8,%3\n"
"1:\tdecl %3\n\t"
@@ -167,8 +164,7 @@ size_t strlen(const char *s)
{
int d0;
size_t res;
- asm volatile("repne\n\t"
- "scasb"
+ asm volatile("repne scasb"
: "=c" (res), "=&D" (d0)
: "1" (s), "a" (0), "0" (0xffffffffu)
: "memory");
@@ -184,8 +180,7 @@ void *memchr(const void *cs, int c, size_t count)
void *res;
if (!count)
return NULL;
- asm volatile("repne\n\t"
- "scasb\n\t"
+ asm volatile("repne scasb\n\t"
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
@@ -202,7 +197,7 @@ void *memscan(void *addr, int c, size_t size)
{
if (!size)
return addr;
- asm volatile("repnz; scasb\n\t"
+ asm volatile("repnz scasb\n\t"
"jnz 1f\n\t"
"dec %%edi\n"
"1:"
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 38f37df056f7..28267985e85f 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -8,16 +8,14 @@ int d0, d1;
register char *__res;
__asm__ __volatile__(
"movl %6,%%edi\n\t"
- "repne\n\t"
- "scasb\n\t"
+ "repne scasb\n\t"
"notl %%ecx\n\t"
"decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */
"movl %%ecx,%%edx\n"
"1:\tmovl %6,%%edi\n\t"
"movl %%esi,%%eax\n\t"
"movl %%edx,%%ecx\n\t"
- "repe\n\t"
- "cmpsb\n\t"
+ "repe cmpsb\n\t"
"je 2f\n\t" /* also works for empty string, see above */
"xchgl %%eax,%%esi\n\t"
"incl %%esi\n\t"
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 422257c350c6..f6f436f1d573 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -38,9 +38,9 @@ do { \
might_fault(); \
__asm__ __volatile__( \
ASM_STAC "\n" \
- "0: rep; stosl\n" \
+ "0: rep stosl\n" \
" movl %2,%0\n" \
- "1: rep; stosb\n" \
+ "1: rep stosb\n" \
"2: " ASM_CLAC "\n" \
_ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \
_ASM_EXTABLE_UA(1b, 2b) \
@@ -140,9 +140,9 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
- "99: rep; movsl\n"
+ "99: rep movsl\n"
"36: movl %%eax, %0\n"
- "37: rep; movsb\n"
+ "37: rep movsb\n"
"100:\n"
_ASM_EXTABLE_UA(1b, 100b)
_ASM_EXTABLE_UA(2b, 100b)
@@ -242,9 +242,9 @@ static unsigned long __copy_user_intel_nocache(void *to,
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
- "6: rep; movsl\n"
+ "6: rep movsl\n"
" movl %%eax,%0\n"
- "7: rep; movsb\n"
+ "7: rep movsb\n"
"8:\n"
_ASM_EXTABLE_UA(0b, 8b)
_ASM_EXTABLE_UA(1b, 8b)
@@ -293,14 +293,14 @@ do { \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
- "4: rep; movsb\n" \
+ "4: rep movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
- "0: rep; movsl\n" \
+ "0: rep movsl\n" \
" movl %3,%0\n" \
- "1: rep; movsb\n" \
+ "1: rep movsb\n" \
"2:\n" \
_ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \
_ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index f5dd84eb55dc..262f7ca1fb95 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -35,7 +35,7 @@
# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
#
-# REX2 Prefix
+# REX2 Prefix Superscripts
# - (!REX2): REX2 is not allowed
# - (REX2): REX2 variant e.g. JMPABS
@@ -147,7 +147,7 @@ AVXcode:
# 0x60 - 0x6f
60: PUSHA/PUSHAD (i64)
61: POPA/POPAD (i64)
-62: BOUND Gv,Ma (i64) | EVEX (Prefix)
+62: BOUND Gv,Ma (i64) | EVEX (Prefix),(o64)
63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
64: SEG=FS (Prefix)
65: SEG=GS (Prefix)
@@ -253,8 +253,8 @@ c0: Grp2 Eb,Ib (1A)
c1: Grp2 Ev,Ib (1A)
c2: RETN Iw (f64)
c3: RETN
-c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
-c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
+c4: LES Gz,Mp (i64) | VEX+2byte (Prefix),(o64)
+c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix),(o64)
c6: Grp11A Eb,Ib (1A)
c7: Grp11B Ev,Iz (1A)
c8: ENTER Iw,Ib
@@ -286,10 +286,10 @@ df: ESC
# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
-e0: LOOPNE/LOOPNZ Jb (f64) (!REX2)
-e1: LOOPE/LOOPZ Jb (f64) (!REX2)
-e2: LOOP Jb (f64) (!REX2)
-e3: JrCXZ Jb (f64) (!REX2)
+e0: LOOPNE/LOOPNZ Jb (f64),(!REX2)
+e1: LOOPE/LOOPZ Jb (f64),(!REX2)
+e2: LOOP Jb (f64),(!REX2)
+e3: JrCXZ Jb (f64),(!REX2)
e4: IN AL,Ib (!REX2)
e5: IN eAX,Ib (!REX2)
e6: OUT Ib,AL (!REX2)
@@ -298,10 +298,10 @@ e7: OUT Ib,eAX (!REX2)
# in "near" jumps and calls is 16-bit. For CALL,
# push of return address is 16-bit wide, RSP is decremented by 2
# but is not truncated to 16 bits, unlike RIP.
-e8: CALL Jz (f64) (!REX2)
-e9: JMP-near Jz (f64) (!REX2)
-ea: JMP-far Ap (i64) (!REX2)
-eb: JMP-short Jb (f64) (!REX2)
+e8: CALL Jz (f64),(!REX2)
+e9: JMP-near Jz (f64),(!REX2)
+ea: JMP-far Ap (i64),(!REX2)
+eb: JMP-short Jb (f64),(!REX2)
ec: IN AL,DX (!REX2)
ed: IN eAX,DX (!REX2)
ee: OUT DX,AL (!REX2)
@@ -478,22 +478,22 @@ AVXcode: 1
7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev)
# 0x0f 0x80-0x8f
# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
-80: JO Jz (f64) (!REX2)
-81: JNO Jz (f64) (!REX2)
-82: JB/JC/JNAE Jz (f64) (!REX2)
-83: JAE/JNB/JNC Jz (f64) (!REX2)
-84: JE/JZ Jz (f64) (!REX2)
-85: JNE/JNZ Jz (f64) (!REX2)
-86: JBE/JNA Jz (f64) (!REX2)
-87: JA/JNBE Jz (f64) (!REX2)
-88: JS Jz (f64) (!REX2)
-89: JNS Jz (f64) (!REX2)
-8a: JP/JPE Jz (f64) (!REX2)
-8b: JNP/JPO Jz (f64) (!REX2)
-8c: JL/JNGE Jz (f64) (!REX2)
-8d: JNL/JGE Jz (f64) (!REX2)
-8e: JLE/JNG Jz (f64) (!REX2)
-8f: JNLE/JG Jz (f64) (!REX2)
+80: JO Jz (f64),(!REX2)
+81: JNO Jz (f64),(!REX2)
+82: JB/JC/JNAE Jz (f64),(!REX2)
+83: JAE/JNB/JNC Jz (f64),(!REX2)
+84: JE/JZ Jz (f64),(!REX2)
+85: JNE/JNZ Jz (f64),(!REX2)
+86: JBE/JNA Jz (f64),(!REX2)
+87: JA/JNBE Jz (f64),(!REX2)
+88: JS Jz (f64),(!REX2)
+89: JNS Jz (f64),(!REX2)
+8a: JP/JPE Jz (f64),(!REX2)
+8b: JNP/JPO Jz (f64),(!REX2)
+8c: JL/JNGE Jz (f64),(!REX2)
+8d: JNL/JGE Jz (f64),(!REX2)
+8e: JLE/JNG Jz (f64),(!REX2)
+8f: JNLE/JG Jz (f64),(!REX2)
# 0x0f 0x90-0x9f
90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66)
91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 32035d5be5a0..3faa60f13a61 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -3,12 +3,10 @@
KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
KCOV_INSTRUMENT_mem_encrypt_amd.o := n
-KCOV_INSTRUMENT_mem_encrypt_identity.o := n
KCOV_INSTRUMENT_pgprot.o := n
KASAN_SANITIZE_mem_encrypt.o := n
KASAN_SANITIZE_mem_encrypt_amd.o := n
-KASAN_SANITIZE_mem_encrypt_identity.o := n
KASAN_SANITIZE_pgprot.o := n
# Disable KCSAN entirely, because otherwise we get warnings that some functions
@@ -16,12 +14,10 @@ KASAN_SANITIZE_pgprot.o := n
KCSAN_SANITIZE := n
# Avoid recursion by not calling KMSAN hooks for CEA code.
KMSAN_SANITIZE_cpu_entry_area.o := n
-KMSAN_SANITIZE_mem_encrypt_identity.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
-CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
CFLAGS_REMOVE_pgprot.o = -pg
endif
@@ -32,7 +28,6 @@ obj-y += pat/
# Make sure __phys_addr has no stackprotector
CFLAGS_physaddr.o := -fno-stack-protector
-CFLAGS_mem_encrypt_identity.o := -fno-stack-protector
CFLAGS_fault.o := -I $(src)/../include/asm/trace
@@ -63,5 +58,4 @@ obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bfa444a7dbb0..aa56d9ac0b8f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -28,6 +28,7 @@
#include <asm/text-patching.h>
#include <asm/memtype.h>
#include <asm/paravirt.h>
+#include <asm/mmu_context.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -824,31 +825,33 @@ void __init poking_init(void)
spinlock_t *ptl;
pte_t *ptep;
- poking_mm = mm_alloc();
- BUG_ON(!poking_mm);
+ text_poke_mm = mm_alloc();
+ BUG_ON(!text_poke_mm);
/* Xen PV guests need the PGD to be pinned. */
- paravirt_enter_mmap(poking_mm);
+ paravirt_enter_mmap(text_poke_mm);
+
+ set_notrack_mm(text_poke_mm);
/*
* Randomize the poking address, but make sure that the following page
* will be mapped at the same PMD. We need 2 pages, so find space for 3,
* and adjust the address if the PMD ends after the first one.
*/
- poking_addr = TASK_UNMAPPED_BASE;
+ text_poke_mm_addr = TASK_UNMAPPED_BASE;
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
- poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
+ text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
- if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
- poking_addr += PAGE_SIZE;
+ if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
+ text_poke_mm_addr += PAGE_SIZE;
/*
* We need to trigger the allocation of the page-tables that will be
* needed for poking now. Later, poking may be performed in an atomic
* section, which might cause allocation to fail.
*/
- ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
BUG_ON(!ptep);
pte_unmap_unlock(ptep, ptl);
}
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 7490ff6d83b1..faf3a13fb6ba 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -40,7 +40,9 @@
* section is later cleared.
*/
u64 sme_me_mask __section(".data") = 0;
+SYM_PIC_ALIAS(sme_me_mask);
u64 sev_status __section(".data") = 0;
+SYM_PIC_ALIAS(sev_status);
u64 sev_check_data __section(".data") = 0;
EXPORT_SYMBOL(sme_me_mask);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index f7ae44d3dd9e..e62af72923e8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -10,6 +10,7 @@
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
+SYM_PIC_ALIAS(physical_mask);
#endif
pgtable_t pte_alloc_one(struct mm_struct *mm)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index eb83348f9305..2ea8cec6cd49 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -847,7 +847,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
- if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm &&
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -899,20 +900,13 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
cond_mitigation(tsk);
/*
- * Let nmi_uaccess_okay() and finish_asid_transition()
- * know that CR3 is changing.
+ * Indicate that CR3 is about to change. nmi_uaccess_okay()
+ * and others are sensitive to the window where mm_cpumask(),
+ * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
*/
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
- /*
- * Leave this CPU in prev's mm_cpumask. Atomic writes to
- * mm_cpumask can be expensive under contention. The CPU
- * will be removed lazily at TLB flush time.
- */
- VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu,
- mm_cpumask(prev)));
-
/* Start receiving IPIs and then read tlb_gen (and LAM below) */
if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -972,6 +966,77 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
}
/*
+ * Using a temporary mm allows to set temporary mappings that are not accessible
+ * by other CPUs. Such mappings are needed to perform sensitive memory writes
+ * that override the kernel memory protections (e.g., W^X), without exposing the
+ * temporary page-table mappings that are required for these write operations to
+ * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
+ * mapping is torn down. Temporary mms can also be used for EFI runtime service
+ * calls or similar functionality.
+ *
+ * It is illegal to schedule while using a temporary mm -- the context switch
+ * code is unaware of the temporary mm and does not know how to context switch.
+ * Use a real (non-temporary) mm in a kernel thread if you need to sleep.
+ *
+ * Note: For sensitive memory writes, the temporary mm needs to be used
+ * exclusively by a single core, and IRQs should be disabled while the
+ * temporary mm is loaded, thereby preventing interrupt handler bugs from
+ * overriding the kernel memory protection.
+ */
+struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm)
+{
+ struct mm_struct *prev_mm;
+
+ lockdep_assert_preemption_disabled();
+ guard(irqsave)();
+
+ /*
+ * Make sure not to be in TLB lazy mode, as otherwise we'll end up
+ * with a stale address space WITHOUT being in lazy mode after
+ * restoring the previous mm.
+ */
+ if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
+ leave_mm();
+
+ prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ switch_mm_irqs_off(NULL, temp_mm, current);
+
+ /*
+ * If breakpoints are enabled, disable them while the temporary mm is
+ * used. Userspace might set up watchpoints on addresses that are used
+ * in the temporary mm, which would lead to wrong signals being sent or
+ * crashes.
+ *
+ * Note that breakpoints are not disabled selectively, which also causes
+ * kernel breakpoints (e.g., perf's) to be disabled. This might be
+ * undesirable, but still seems reasonable as the code that runs in the
+ * temporary mm should be short.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_disable();
+
+ return prev_mm;
+}
+
+void unuse_temporary_mm(struct mm_struct *prev_mm)
+{
+ lockdep_assert_preemption_disabled();
+ guard(irqsave)();
+
+ /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
+ cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm)));
+
+ switch_mm_irqs_off(NULL, prev_mm, current);
+
+ /*
+ * Restore the breakpoints if they were disabled before the temporary mm
+ * was loaded.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
+}
+
+/*
* Call this when reinitializing a CPU. It fixes the following potential
* problems:
*
@@ -1204,8 +1269,16 @@ done:
static bool should_flush_tlb(int cpu, void *data)
{
+ struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
struct flush_tlb_info *info = data;
+ /*
+ * Order the 'loaded_mm' and 'is_lazy' against their
+ * write ordering in switch_mm_irqs_off(). Ensure
+ * 'is_lazy' is at least as new as 'loaded_mm'.
+ */
+ smp_rmb();
+
/* Lazy TLB will get flushed at the next context switch. */
if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
return false;
@@ -1214,8 +1287,15 @@ static bool should_flush_tlb(int cpu, void *data)
if (!info->mm)
return true;
+ /*
+ * While switching, the remote CPU could have state from
+ * either the prev or next mm. Assume the worst and flush.
+ */
+ if (loaded_mm == LOADED_MM_SWITCHING)
+ return true;
+
/* The target mm is loaded, and the CPU is not lazy. */
- if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ if (loaded_mm == info->mm)
return true;
/* In cpumask, but not the loaded mm? Periodically remove by flushing. */
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 9e5fe2ba858f..e2b9991c3326 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -629,7 +629,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
goto out;
ret = 1;
if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
- text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
+ smp_text_poke_single(ip, new_insn, X86_PATCH_SIZE, NULL);
ret = 0;
}
out:
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index a4b4ebd41b8f..e7e8f77f77f8 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -89,6 +89,7 @@ int __init efi_alloc_page_tables(void)
efi_mm.pgd = efi_pgd;
mm_init_cpumask(&efi_mm);
init_new_context(NULL, &efi_mm);
+ set_notrack_mm(&efi_mm);
return 0;
@@ -434,15 +435,12 @@ void __init efi_dump_pagetable(void)
*/
static void efi_enter_mm(void)
{
- efi_prev_mm = current->active_mm;
- current->active_mm = &efi_mm;
- switch_mm(efi_prev_mm, &efi_mm, NULL);
+ efi_prev_mm = use_temporary_mm(&efi_mm);
}
static void efi_leave_mm(void)
{
- current->active_mm = efi_prev_mm;
- switch_mm(&efi_mm, efi_prev_mm, NULL);
+ unuse_temporary_mm(efi_prev_mm);
}
void arch_efi_call_virt_setup(void)
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index cfa18ec7d55f..1d78e5631bb8 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -87,8 +87,7 @@ SYM_CODE_START(pvh_start_xen)
mov %ebx, %esi
movl rva(pvh_start_info_sz)(%ebp), %ecx
shr $2,%ecx
- rep
- movsl
+ rep movsl
leal rva(early_stack_end)(%ebp), %esp
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 5606a15cf9a1..fb910d9f8471 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -69,8 +69,7 @@ copy_loop:
movl pbe_orig_address(%edx), %edi
movl $(PAGE_SIZE >> 2), %ecx
- rep
- movsl
+ rep movsl
movl pbe_next(%edx), %edx
jmp copy_loop
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index 66f066b8feda..c73be0a02a6c 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -138,8 +138,7 @@ SYM_FUNC_START(core_restore_code)
movq pbe_address(%rdx), %rsi
movq pbe_orig_address(%rdx), %rdi
movq $(PAGE_SIZE >> 3), %rcx
- rep
- movsq
+ rep movsq
/* progress to the next pbe */
movq pbe_next(%rdx), %rdx
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index 5770c8097f32..2c19d7fc8a85 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -64,6 +64,8 @@ BEGIN {
modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
force64_expr = "\\([df]64\\)"
+ invalid64_expr = "\\(i64\\)"
+ only64_expr = "\\(o64\\)"
rex_expr = "^((REX(\\.[XRWB]+)+)|(REX$))"
rex2_expr = "\\(REX2\\)"
no_rex2_expr = "\\(!REX2\\)"
@@ -319,6 +321,11 @@ function convert_operands(count,opnd, i,j,imm,mod)
if (match(ext, force64_expr))
flags = add_flags(flags, "INAT_FORCE64")
+ # check invalid in 64-bit (and no only64)
+ if (match(ext, invalid64_expr) &&
+ !match($0, only64_expr))
+ flags = add_flags(flags, "INAT_INV64")
+
# check REX2 not allowed
if (match(ext, no_rex2_expr))
flags = add_flags(flags, "INAT_NO_REX2")
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
index ab5c8e47049c..9193a7790a71 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_32.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -31,8 +31,8 @@ struct faultinfo {
#define ___backtrack_faulted(_faulted) \
asm volatile ( \
- "mov $0, %0\n" \
"movl $__get_kernel_nofault_faulted_%=,%1\n" \
+ "mov $0, %0\n" \
"jmp _end_%=\n" \
"__get_kernel_nofault_faulted_%=:\n" \
"mov $1, %0;" \
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h
index 26fb4835d3e9..61e4ca1e0ab5 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_64.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_64.h
@@ -31,8 +31,8 @@ struct faultinfo {
#define ___backtrack_faulted(_faulted) \
asm volatile ( \
- "mov $0, %0\n" \
"movq $__get_kernel_nofault_faulted_%=,%1\n" \
+ "mov $0, %0\n" \
"jmp _end_%=\n" \
"__get_kernel_nofault_faulted_%=:\n" \
"mov $1, %0;" \