Diffstat (limited to 'arch')
153 files changed, 5304 insertions, 4076 deletions
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi b/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi index f2386dcb9ff2..dda4fa91b2f2 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi @@ -40,6 +40,9 @@ reg = <1>; interrupt-parent = <&gpio4>; interrupts = <16 IRQ_TYPE_LEVEL_LOW>; + micrel,led-mode = <1>; + clocks = <&clks IMX6UL_CLK_ENET_REF>; + clock-names = "rmii-ref"; status = "okay"; }; }; diff --git a/arch/arm64/boot/dts/arm/morello.dtsi b/arch/arm64/boot/dts/arm/morello.dtsi index 0bab0b3ea969..5bc1c725dc86 100644 --- a/arch/arm64/boot/dts/arm/morello.dtsi +++ b/arch/arm64/boot/dts/arm/morello.dtsi @@ -44,7 +44,7 @@ next-level-cache = <&l2_0>; clocks = <&scmi_dvfs 0>; - l2_0: l2-cache-0 { + l2_0: l2-cache { compatible = "cache"; cache-level = <2>; /* 8 ways set associative */ @@ -53,13 +53,6 @@ cache-sets = <2048>; cache-unified; next-level-cache = <&l3_0>; - - l3_0: l3-cache { - compatible = "cache"; - cache-level = <3>; - cache-size = <0x100000>; - cache-unified; - }; }; }; @@ -78,7 +71,7 @@ next-level-cache = <&l2_1>; clocks = <&scmi_dvfs 0>; - l2_1: l2-cache-1 { + l2_1: l2-cache { compatible = "cache"; cache-level = <2>; /* 8 ways set associative */ @@ -105,7 +98,7 @@ next-level-cache = <&l2_2>; clocks = <&scmi_dvfs 1>; - l2_2: l2-cache-2 { + l2_2: l2-cache { compatible = "cache"; cache-level = <2>; /* 8 ways set associative */ @@ -132,7 +125,7 @@ next-level-cache = <&l2_3>; clocks = <&scmi_dvfs 1>; - l2_3: l2-cache-3 { + l2_3: l2-cache { compatible = "cache"; cache-level = <2>; /* 8 ways set associative */ @@ -143,6 +136,13 @@ next-level-cache = <&l3_0>; }; }; + + l3_0: l3-cache { + compatible = "cache"; + cache-level = <3>; + cache-size = <0x100000>; + cache-unified; + }; }; firmware { diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 7251ad3a0017..b46566f3ce20 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -144,6 +144,19 @@ startup-delay-us = <20000>; }; + reg_usdhc2_vqmmc: regulator-usdhc2-vqmmc { + compatible = "regulator-gpio"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_usdhc2_vsel>; + gpios = <&gpio1 4 GPIO_ACTIVE_HIGH>; + regulator-max-microvolt = <3300000>; + regulator-min-microvolt = <1800000>; + states = <1800000 0x1>, + <3300000 0x0>; + regulator-name = "PMIC_USDHC_VSELECT"; + vin-supply = <®_nvcc_sd>; + }; + reserved-memory { #address-cells = <2>; #size-cells = <2>; @@ -269,7 +282,7 @@ "SODIMM_19", "", "", - "", + "PMIC_USDHC_VSELECT", "", "", "", @@ -785,6 +798,7 @@ pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_cd>; pinctrl-3 = <&pinctrl_usdhc2_sleep>, <&pinctrl_usdhc2_cd_sleep>; vmmc-supply = <®_usdhc2_vmmc>; + vqmmc-supply = <®_usdhc2_vqmmc>; }; &wdog1 { @@ -1206,13 +1220,17 @@ <MX8MM_IOMUXC_NAND_CLE_GPIO3_IO5 0x6>; /* SODIMM 76 */ }; + pinctrl_usdhc2_vsel: usdhc2vselgrp { + fsl,pins = + <MX8MM_IOMUXC_GPIO1_IO04_GPIO1_IO4 0x10>; /* PMIC_USDHC_VSELECT */ + }; + /* * Note: Due to ERR050080 we use discrete external on-module resistors pulling-up to the * on-module +V3.3_1.8_SD (LDO5) rail and explicitly disable the internal pull-ups here. 
*/ pinctrl_usdhc2: usdhc2grp { fsl,pins = - <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>, <MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x90>, /* SODIMM 78 */ <MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x90>, /* SODIMM 74 */ <MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x90>, /* SODIMM 80 */ @@ -1223,7 +1241,6 @@ pinctrl_usdhc2_100mhz: usdhc2-100mhzgrp { fsl,pins = - <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>, <MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x94>, <MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x94>, <MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x94>, @@ -1234,7 +1251,6 @@ pinctrl_usdhc2_200mhz: usdhc2-200mhzgrp { fsl,pins = - <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>, <MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x96>, <MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x96>, <MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x96>, @@ -1246,7 +1262,6 @@ /* Avoid backfeeding with removed card power */ pinctrl_usdhc2_sleep: usdhc2slpgrp { fsl,pins = - <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x0>, <MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x0>, <MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x0>, <MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x0>, diff --git a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi index a1b75c9068b2..dc0ccd723c6d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi @@ -24,6 +24,20 @@ fsl,operating-mode = "nominal"; }; +&gpu2d { + assigned-clocks = <&clk IMX8MP_CLK_GPU2D_CORE>; + assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>; + assigned-clock-rates = <800000000>; +}; + +&gpu3d { + assigned-clocks = <&clk IMX8MP_CLK_GPU3D_CORE>, + <&clk IMX8MP_CLK_GPU3D_SHADER_CORE>; + assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>, + <&clk IMX8MP_SYS_PLL1_800M>; + assigned-clock-rates = <800000000>, <800000000>; +}; + &pgc_hdmimix { assigned-clocks = <&clk IMX8MP_CLK_HDMI_AXI>, <&clk IMX8MP_CLK_HDMI_APB>; @@ -46,6 +60,18 @@ assigned-clock-rates = <600000000>, <300000000>; }; +&pgc_mlmix { + assigned-clocks = <&clk IMX8MP_CLK_ML_CORE>, + <&clk IMX8MP_CLK_ML_AXI>, + <&clk IMX8MP_CLK_ML_AHB>; + assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>, + <&clk IMX8MP_SYS_PLL1_800M>, + <&clk IMX8MP_SYS_PLL1_800M>; + assigned-clock-rates = <800000000>, + <800000000>, + <300000000>; +}; + &media_blk_ctrl { assigned-clocks = <&clk IMX8MP_CLK_MEDIA_AXI>, <&clk IMX8MP_CLK_MEDIA_APB>, diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 9bb26b466a06..59f057ba6fa7 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -1626,7 +1626,7 @@ reg = <0 0x4c300000 0 0x10000>, <0 0x60100000 0 0xfe00000>, <0 0x4c360000 0 0x10000>, - <0 0x4c340000 0 0x2000>; + <0 0x4c340000 0 0x4000>; reg-names = "dbi", "config", "atu", "app"; ranges = <0x81000000 0x0 0x00000000 0x0 0x6ff00000 0 0x00100000>, <0x82000000 0x0 0x10000000 0x9 0x10000000 0 0x10000000>; @@ -1673,7 +1673,7 @@ reg = <0 0x4c300000 0 0x10000>, <0 0x4c360000 0 0x1000>, <0 0x4c320000 0 0x1000>, - <0 0x4c340000 0 0x2000>, + <0 0x4c340000 0 0x4000>, <0 0x4c370000 0 0x10000>, <0x9 0 1 0>; reg-names = "dbi","atu", "dbi2", "app", "dma", "addr_space"; @@ -1700,7 +1700,7 @@ reg = <0 0x4c380000 0 0x10000>, <8 0x80100000 0 0xfe00000>, <0 0x4c3e0000 0 0x10000>, - <0 0x4c3c0000 0 0x2000>; + <0 0x4c3c0000 0 0x4000>; reg-names = "dbi", "config", "atu", "app"; ranges = <0x81000000 0 0x00000000 0x8 0x8ff00000 0 0x00100000>, <0x82000000 0 0x10000000 0xa 0x10000000 0 0x10000000>; @@ -1749,7 +1749,7 @@ reg = <0 0x4c380000 0 0x10000>, <0 0x4c3e0000 0 0x1000>, 
<0 0x4c3a0000 0 0x1000>, - <0 0x4c3c0000 0 0x2000>, + <0 0x4c3c0000 0 0x4000>, <0 0x4c3f0000 0 0x10000>, <0xa 0 1 0>; reg-names = "dbi", "atu", "dbi2", "app", "dma", "addr_space"; diff --git a/arch/arm64/boot/dts/st/stm32mp211.dtsi b/arch/arm64/boot/dts/st/stm32mp211.dtsi index 6dd1377f3e1d..bf888d60cd4f 100644 --- a/arch/arm64/boot/dts/st/stm32mp211.dtsi +++ b/arch/arm64/boot/dts/st/stm32mp211.dtsi @@ -116,11 +116,11 @@ }; intc: interrupt-controller@4ac10000 { - compatible = "arm,cortex-a7-gic"; + compatible = "arm,gic-400"; reg = <0x4ac10000 0x0 0x1000>, - <0x4ac20000 0x0 0x2000>, - <0x4ac40000 0x0 0x2000>, - <0x4ac60000 0x0 0x2000>; + <0x4ac20000 0x0 0x20000>, + <0x4ac40000 0x0 0x20000>, + <0x4ac60000 0x0 0x20000>; #interrupt-cells = <3>; interrupt-controller; }; diff --git a/arch/arm64/boot/dts/st/stm32mp231.dtsi b/arch/arm64/boot/dts/st/stm32mp231.dtsi index 8820d219a33e..75697acd1345 100644 --- a/arch/arm64/boot/dts/st/stm32mp231.dtsi +++ b/arch/arm64/boot/dts/st/stm32mp231.dtsi @@ -1201,13 +1201,12 @@ }; intc: interrupt-controller@4ac10000 { - compatible = "arm,cortex-a7-gic"; + compatible = "arm,gic-400"; reg = <0x4ac10000 0x1000>, - <0x4ac20000 0x2000>, - <0x4ac40000 0x2000>, - <0x4ac60000 0x2000>; + <0x4ac20000 0x20000>, + <0x4ac40000 0x20000>, + <0x4ac60000 0x20000>; #interrupt-cells = <3>; - #address-cells = <1>; interrupt-controller; }; }; diff --git a/arch/arm64/boot/dts/st/stm32mp251.dtsi b/arch/arm64/boot/dts/st/stm32mp251.dtsi index f3c6cdfd7008..87110f91e489 100644 --- a/arch/arm64/boot/dts/st/stm32mp251.dtsi +++ b/arch/arm64/boot/dts/st/stm32mp251.dtsi @@ -115,14 +115,13 @@ }; intc: interrupt-controller@4ac00000 { - compatible = "arm,cortex-a7-gic"; + compatible = "arm,gic-400"; #interrupt-cells = <3>; - #address-cells = <1>; interrupt-controller; reg = <0x0 0x4ac10000 0x0 0x1000>, - <0x0 0x4ac20000 0x0 0x2000>, - <0x0 0x4ac40000 0x0 0x2000>, - <0x0 0x4ac60000 0x0 0x2000>; + <0x0 0x4ac20000 0x0 0x20000>, + <0x0 0x4ac40000 0x0 0x20000>, + <0x0 0x4ac60000 0x0 0x20000>; }; psci { diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index ebceaae3c749..d40e427ddad9 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -52,7 +52,7 @@ mrs x0, id_aa64mmfr1_el1 ubfx x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 cbz x0, .Lskip_hcrx_\@ - mov_q x0, HCRX_HOST_FLAGS + mov_q x0, (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM) /* Enable GCS if supported */ mrs_s x1, SYS_ID_AA64PFR1_EL1 diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 974d72b5905b..e9c8a581e16f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -100,9 +100,8 @@ HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID1) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) -#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) +#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H | HCR_AMO | HCR_IMO | HCR_FMO) -#define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM) #define MPAMHCR_HOST_FLAGS 0 /* TCR_EL2 Registers bits */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index 92a2b59a9f3d..3322c7047d84 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -99,6 +99,19 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return res; } +#if 
IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) +{ + const struct vdso_time_data *ret = &vdso_u_time_data; + + /* Work around invalid absolute relocations */ + OPTIMIZER_HIDE_VAR(ret); + + return ret; +} +#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data +#endif /* IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) */ + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9c4d6d552b25..4c46d80aa64b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -114,7 +114,14 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); -bool arm64_use_ng_mappings = false; +/* + * arm64_use_ng_mappings must be placed in the .data section, otherwise it + * ends up in the .bss section where it is initialized in early_map_kernel() + * after the MMU (with the idmap) was enabled. create_init_idmap() - which + * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG - + * may end up generating an incorrect idmap page table attributes. + */ +bool arm64_use_ng_mappings __read_mostly = false; EXPORT_SYMBOL(arm64_use_ng_mappings); DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index b741ea6aefa5..96f625dc7256 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -235,6 +235,8 @@ static inline void __deactivate_traps_mpam(void) static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); @@ -245,11 +247,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * EL1 instead of being trapped to EL2. 
*/ if (system_supports_pmuv3()) { - struct kvm_cpu_context *hctxt; - write_sysreg(0, pmselr_el0); - hctxt = host_data_ptr(host_ctxt); ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); @@ -269,6 +268,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) hcrx &= ~clr; } + ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2); write_sysreg_s(hcrx, SYS_HCRX_EL2); } @@ -278,19 +278,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); write_sysreg(0, hstr_el2); if (system_supports_pmuv3()) { - struct kvm_cpu_context *hctxt; - - hctxt = host_data_ptr(host_ctxt); write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); } if (cpus_have_final_cap(ARM64_HAS_HCX)) - write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2); + write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); __deactivate_traps_mpam(); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2a5284f749b4..e80f3ebd3e2a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -503,7 +503,7 @@ int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { int ret; - if (!addr_is_memory(addr)) + if (!range_is_memory(addr, addr + size)) return -EPERM; ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index ed363aa3027e..50aa8dbcae75 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -429,23 +429,27 @@ u64 __vgic_v3_get_gic_config(void) /* * To check whether we have a MMIO-based (GICv2 compatible) * CPU interface, we need to disable the system register - * view. To do that safely, we have to prevent any interrupt - * from firing (which would be deadly). + * view. * - * Note that this only makes sense on VHE, as interrupts are - * already masked for nVHE as part of the exception entry to - * EL2. - */ - if (has_vhe()) - flags = local_daif_save(); - - /* * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates * that to be able to set ICC_SRE_EL1.SRE to 0, all the * interrupt overrides must be set. You've got to love this. + * + * As we always run VHE with HCR_xMO set, no extra xMO + * manipulation is required in that case. + * + * To safely disable SRE, we have to prevent any interrupt + * from firing (which would be deadly). This only makes sense + * on VHE, as interrupts are already masked for nVHE as part + * of the exception entry to EL2. */ - sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); - isb(); + if (has_vhe()) { + flags = local_daif_save(); + } else { + sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); + } + write_gicreg(0, ICC_SRE_EL1); isb(); @@ -453,11 +457,13 @@ u64 __vgic_v3_get_gic_config(void) write_gicreg(sre, ICC_SRE_EL1); isb(); - sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); - isb(); - if (has_vhe()) + if (has_vhe()) { local_daif_restore(flags); + } else { + sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); + } val = (val & ICC_SRE_EL1_SRE) ? 
0 : (1ULL << 63); val |= read_gicreg(ICH_VTR_EL2); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 754f2fe0cc67..eeda92330ade 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1501,6 +1501,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + if (!is_protected_kvm_enabled()) + memcache = &vcpu->arch.mmu_page_cache; + else + memcache = &vcpu->arch.pkvm_memcache; + /* * Permission faults just need to update the existing leaf entry, * and so normally don't require allocations from the memcache. The @@ -1510,13 +1515,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (!fault_is_perm || (logging_active && write_fault)) { int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); - if (!is_protected_kvm_enabled()) { - memcache = &vcpu->arch.mmu_page_cache; + if (!is_protected_kvm_enabled()) ret = kvm_mmu_topup_memory_cache(memcache, min_pages); - } else { - memcache = &vcpu->arch.pkvm_memcache; + else ret = topup_hyp_memcache(memcache, min_pages); - } + if (ret) return ret; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 005ad28f7306..5dde9285afc8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1945,6 +1945,12 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, if ((hw_val & mpam_mask) == (user_val & mpam_mask)) user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + /* Fail the guest's request to disable the AA64 ISA at EL{0,1,2} */ + if (!FIELD_GET(ID_AA64PFR0_EL1_EL0, user_val) || + !FIELD_GET(ID_AA64PFR0_EL1_EL1, user_val) || + (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val))) + return -EINVAL; + return set_id_reg(vcpu, rd, user_val); } diff --git a/arch/mips/include/asm/idle.h b/arch/mips/include/asm/idle.h index 0992cad9c632..c7d75807d13f 100644 --- a/arch/mips/include/asm/idle.h +++ b/arch/mips/include/asm/idle.h @@ -6,11 +6,10 @@ #include <linux/linkage.h> extern void (*cpu_wait)(void); -extern void r4k_wait(void); -extern asmlinkage void __r4k_wait(void); +extern asmlinkage void r4k_wait(void); extern void r4k_wait_irqoff(void); -static inline int using_rollback_handler(void) +static inline int using_skipover_handler(void) { return cpu_wait == r4k_wait; } diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 85fa9962266a..ef72c46b5568 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -65,7 +65,8 @@ static inline void instruction_pointer_set(struct pt_regs *regs, /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); -#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last)) +#define MAX_REG_OFFSET \ + (offsetof(struct pt_regs, __last) - sizeof(unsigned long)) /** * regs_get_register() - get register value from its offset diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S index a572ce36a24f..08c0a01d9a29 100644 --- a/arch/mips/kernel/genex.S +++ b/arch/mips/kernel/genex.S @@ -104,48 +104,59 @@ handle_vcei: __FINIT - .align 5 /* 32 byte rollback region */ -LEAF(__r4k_wait) - .set push - .set noreorder - /* start of rollback region */ - LONG_L t0, TI_FLAGS($28) - nop - andi t0, _TIF_NEED_RESCHED - bnez t0, 1f - nop - nop - nop -#ifdef CONFIG_CPU_MICROMIPS - nop - nop - nop - nop -#endif + .section .cpuidle.text,"ax" + /* Align to 32 bytes for the maximum idle interrupt region size. */ + .align 5 +LEAF(r4k_wait) + /* Keep the ISA bit clear for calculations on local labels here. 
*/ +0: .fill 0 + /* Start of idle interrupt region. */ + local_irq_enable + /* + * If an interrupt lands here, before going idle on the next + * instruction, we must *NOT* go idle since the interrupt could + * have set TIF_NEED_RESCHED or caused a timer to need resched. + * Fall through -- see skipover_handler below -- and have the + * idle loop take care of things. + */ +1: .fill 0 + /* The R2 EI/EHB sequence takes 8 bytes, otherwise pad up. */ + .if 1b - 0b > 32 + .error "overlong idle interrupt region" + .elseif 1b - 0b > 8 + .align 4 + .endif +2: .fill 0 + .equ r4k_wait_idle_size, 2b - 0b + /* End of idle interrupt region; size has to be a power of 2. */ .set MIPS_ISA_ARCH_LEVEL_RAW +r4k_wait_insn: wait - /* end of rollback region (the region size must be power of two) */ -1: +r4k_wait_exit: + .set mips0 + local_irq_disable jr ra - nop - .set pop - END(__r4k_wait) + END(r4k_wait) + .previous - .macro BUILD_ROLLBACK_PROLOGUE handler - FEXPORT(rollback_\handler) + .macro BUILD_SKIPOVER_PROLOGUE handler + FEXPORT(skipover_\handler) .set push .set noat MFC0 k0, CP0_EPC - PTR_LA k1, __r4k_wait - ori k0, 0x1f /* 32 byte rollback region */ - xori k0, 0x1f + /* Subtract/add 2 to let the ISA bit propagate through the mask. */ + PTR_LA k1, r4k_wait_insn - 2 + ori k0, r4k_wait_idle_size - 2 + .set noreorder bne k0, k1, \handler + PTR_ADDIU k0, r4k_wait_exit - r4k_wait_insn + 2 + .set reorder MTC0 k0, CP0_EPC .set pop .endm .align 5 -BUILD_ROLLBACK_PROLOGUE handle_int +BUILD_SKIPOVER_PROLOGUE handle_int NESTED(handle_int, PT_SIZE, sp) .cfi_signal_frame #ifdef CONFIG_TRACE_IRQFLAGS @@ -265,7 +276,7 @@ NESTED(except_vec_ejtag_debug, 0, sp) * This prototype is copied to ebase + n*IntCtl.VS and patched * to invoke the handler */ -BUILD_ROLLBACK_PROLOGUE except_vec_vi +BUILD_SKIPOVER_PROLOGUE except_vec_vi NESTED(except_vec_vi, 0, sp) SAVE_SOME docfi=1 SAVE_AT docfi=1 diff --git a/arch/mips/kernel/idle.c b/arch/mips/kernel/idle.c index 5abc8b7340f8..80e8a04a642e 100644 --- a/arch/mips/kernel/idle.c +++ b/arch/mips/kernel/idle.c @@ -35,13 +35,6 @@ static void __cpuidle r3081_wait(void) write_c0_conf(cfg | R30XX_CONF_HALT); } -void __cpuidle r4k_wait(void) -{ - raw_local_irq_enable(); - __r4k_wait(); - raw_local_irq_disable(); -} - /* * This variant is preferable as it allows testing need_resched and going to * sleep depending on the outcome atomically. 
Unfortunately the "It is diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index e85bd087467e..cc26d56f3ab6 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -332,6 +332,8 @@ static void __init cps_prepare_cpus(unsigned int max_cpus) mips_cps_cluster_bootcfg = kcalloc(nclusters, sizeof(*mips_cps_cluster_bootcfg), GFP_KERNEL); + if (!mips_cps_cluster_bootcfg) + goto err_out; if (nclusters > 1) mips_cm_update_property(); @@ -348,6 +350,8 @@ static void __init cps_prepare_cpus(unsigned int max_cpus) mips_cps_cluster_bootcfg[cl].core_power = kcalloc(BITS_TO_LONGS(ncores), sizeof(unsigned long), GFP_KERNEL); + if (!mips_cps_cluster_bootcfg[cl].core_power) + goto err_out; /* Allocate VPE boot configuration structs */ for (c = 0; c < ncores; c++) { diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 39e248d0ed59..8ec1e185b35c 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -77,7 +77,7 @@ #include "access-helper.h" extern void check_wait(void); -extern asmlinkage void rollback_handle_int(void); +extern asmlinkage void skipover_handle_int(void); extern asmlinkage void handle_int(void); extern asmlinkage void handle_adel(void); extern asmlinkage void handle_ades(void); @@ -2066,7 +2066,7 @@ void *set_vi_handler(int n, vi_handler_t addr) { extern const u8 except_vec_vi[]; extern const u8 except_vec_vi_ori[], except_vec_vi_end[]; - extern const u8 rollback_except_vec_vi[]; + extern const u8 skipover_except_vec_vi[]; unsigned long handler; unsigned long old_handler = vi_handlers[n]; int srssets = current_cpu_data.srsets; @@ -2095,7 +2095,7 @@ void *set_vi_handler(int n, vi_handler_t addr) change_c0_srsmap(0xf << n*4, 0 << n*4); } - vec_start = using_rollback_handler() ? rollback_except_vec_vi : + vec_start = using_skipover_handler() ? skipover_except_vec_vi : except_vec_vi; #if defined(CONFIG_CPU_MICROMIPS) || defined(CONFIG_CPU_BIG_ENDIAN) ori_offset = except_vec_vi_ori - vec_start + 2; @@ -2426,8 +2426,8 @@ void __init trap_init(void) if (board_be_init) board_be_init(); - set_except_vector(EXCCODE_INT, using_rollback_handler() ? - rollback_handle_int : handle_int); + set_except_vector(EXCCODE_INT, using_skipover_handler() ? 
+ skipover_handle_int : handle_int); set_except_vector(EXCCODE_MOD, handle_tlbm); set_except_vector(EXCCODE_TLBL, handle_tlbl); set_except_vector(EXCCODE_TLBS, handle_tlbs); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 7c244de77180..15d8f75902f8 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -275,6 +275,9 @@ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) unsigned long pmm; u8 pmlen; + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + return -EINVAL; + if (is_compat_thread(ti)) return -EINVAL; @@ -330,6 +333,9 @@ long get_tagged_addr_ctrl(struct task_struct *task) struct thread_info *ti = task_thread_info(task); long ret = 0; + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + return -EINVAL; + if (is_compat_thread(ti)) return -EINVAL; diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 8ff8e8b36524..9c83848797a7 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -198,47 +198,57 @@ asmlinkage __visible __trap_section void do_trap_insn_illegal(struct pt_regs *re DO_ERROR_INFO(do_trap_load_fault, SIGSEGV, SEGV_ACCERR, "load access fault"); -asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs) +enum misaligned_access_type { + MISALIGNED_STORE, + MISALIGNED_LOAD, +}; +static const struct { + const char *type_str; + int (*handler)(struct pt_regs *regs); +} misaligned_handler[] = { + [MISALIGNED_STORE] = { + .type_str = "Oops - store (or AMO) address misaligned", + .handler = handle_misaligned_store, + }, + [MISALIGNED_LOAD] = { + .type_str = "Oops - load address misaligned", + .handler = handle_misaligned_load, + }, +}; + +static void do_trap_misaligned(struct pt_regs *regs, enum misaligned_access_type type) { + irqentry_state_t state; + if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); + local_irq_enable(); + } else { + state = irqentry_nmi_enter(regs); + } - if (handle_misaligned_load(regs)) - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - load address misaligned"); + if (misaligned_handler[type].handler(regs)) + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + misaligned_handler[type].type_str); + if (user_mode(regs)) { + local_irq_disable(); irqentry_exit_to_user_mode(regs); } else { - irqentry_state_t state = irqentry_nmi_enter(regs); - - if (handle_misaligned_load(regs)) - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - load address misaligned"); - irqentry_nmi_exit(regs, state); } } -asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs) +asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs) { - if (user_mode(regs)) { - irqentry_enter_from_user_mode(regs); - - if (handle_misaligned_store(regs)) - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - store (or AMO) address misaligned"); - - irqentry_exit_to_user_mode(regs); - } else { - irqentry_state_t state = irqentry_nmi_enter(regs); - - if (handle_misaligned_store(regs)) - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - store (or AMO) address misaligned"); + do_trap_misaligned(regs, MISALIGNED_LOAD); +} - irqentry_nmi_exit(regs, state); - } +asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs) +{ + do_trap_misaligned(regs, MISALIGNED_STORE); } + DO_ERROR_INFO(do_trap_store_fault, SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault"); DO_ERROR_INFO(do_trap_ecall_s, diff --git 
a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 4354c87c0376..77c788660223 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -88,6 +88,13 @@ #define INSN_MATCH_C_FSWSP 0xe002 #define INSN_MASK_C_FSWSP 0xe003 +#define INSN_MATCH_C_LHU 0x8400 +#define INSN_MASK_C_LHU 0xfc43 +#define INSN_MATCH_C_LH 0x8440 +#define INSN_MASK_C_LH 0xfc43 +#define INSN_MATCH_C_SH 0x8c00 +#define INSN_MASK_C_SH 0xfc43 + #define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4) #if defined(CONFIG_64BIT) @@ -268,7 +275,7 @@ static unsigned long get_f32_rs(unsigned long insn, u8 fp_reg_offset, int __ret; \ \ if (user_mode(regs)) { \ - __ret = __get_user(insn, (type __user *) insn_addr); \ + __ret = get_user(insn, (type __user *) insn_addr); \ } else { \ insn = *(type *)insn_addr; \ __ret = 0; \ @@ -431,6 +438,13 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs) fp = 1; len = 4; #endif + } else if ((insn & INSN_MASK_C_LHU) == INSN_MATCH_C_LHU) { + len = 2; + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LH) == INSN_MATCH_C_LH) { + len = 2; + shift = 8 * (sizeof(ulong) - len); + insn = RVC_RS2S(insn) << SH_RD; } else { regs->epc = epc; return -1; @@ -530,6 +544,9 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs) len = 4; val.data_ulong = GET_F32_RS2C(insn, regs); #endif + } else if ((insn & INSN_MASK_C_SH) == INSN_MATCH_C_SH) { + len = 2; + val.data_ulong = GET_RS2S(insn, regs); } else { regs->epc = epc; return -1; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 60d684c76c58..02635bac91f1 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -77,6 +77,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) memcpy(cntx, reset_cntx, sizeof(*cntx)); spin_unlock(&vcpu->arch.reset_cntx_lock); + memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr)); + kvm_riscv_vcpu_fp_reset(vcpu); kvm_riscv_vcpu_vector_reset(vcpu); diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6f2c9ce1b154..24b22f6a9e99 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -38,7 +38,6 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y @@ -92,7 +91,6 @@ CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y CONFIG_BINFMT_MISC=m CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=y CONFIG_ZSMALLOC_STAT=y CONFIG_SLAB_BUCKETS=y CONFIG_SLUB_STATS=y @@ -395,6 +393,9 @@ CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m @@ -405,6 +406,9 @@ CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_GATE=m CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y @@ -628,8 +632,16 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_MEM=m CONFIG_VIRTIO_INPUT=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m +CONFIG_MLX5_VDPA_NET=m +CONFIG_VP_VDPA=m CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -654,7 +666,6 @@ CONFIG_NILFS2_FS=m CONFIG_BCACHEFS_FS=y CONFIG_BCACHEFS_QUOTA=y 
CONFIG_BCACHEFS_POSIX_ACL=y -CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y CONFIG_FS_VERITY=y @@ -724,11 +735,10 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_LOCKDOWN_LSM=y @@ -741,6 +751,8 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_HARDENED_USERCOPY=y CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set @@ -756,7 +768,6 @@ CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m @@ -801,7 +812,6 @@ CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_CHACHA_S390=m CONFIG_CRYPTO_HMAC_S390=m CONFIG_ZCRYPT=m CONFIG_PKEY=m @@ -812,9 +822,9 @@ CONFIG_PKEY_UV=m CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CRYPTO_KRB5=m +CONFIG_CRYPTO_KRB5_SELFTESTS=y CONFIG_CORDIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_RANDOM32_SELFTEST=y CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index f18a7d97ac21..2b8b42d569bc 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -36,7 +36,6 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y @@ -86,7 +85,6 @@ CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y CONFIG_BINFMT_MISC=m CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=y CONFIG_ZSMALLOC_STAT=y CONFIG_SLAB_BUCKETS=y # CONFIG_COMPAT_BRK is not set @@ -385,6 +383,9 @@ CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m @@ -395,6 +396,9 @@ CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_GATE=m CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y @@ -618,8 +622,16 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_MEM=m CONFIG_VIRTIO_INPUT=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m +CONFIG_MLX5_VDPA_NET=m +CONFIG_VP_VDPA=m CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -641,7 +653,6 @@ CONFIG_NILFS2_FS=m CONFIG_BCACHEFS_FS=m CONFIG_BCACHEFS_QUOTA=y CONFIG_BCACHEFS_POSIX_ACL=y -CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y CONFIG_FS_VERITY=y @@ -711,6 +722,7 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y @@ -742,7 +754,6 @@ CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m @@ -788,7 +799,6 @@ CONFIG_CRYPTO_SHA3_512_S390=m 
CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_CHACHA_S390=m CONFIG_CRYPTO_HMAC_S390=m CONFIG_ZCRYPT=m CONFIG_PKEY=m @@ -799,10 +809,10 @@ CONFIG_PKEY_UV=m CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CRYPTO_KRB5=m +CONFIG_CRYPTO_KRB5_SELFTESTS=y CONFIG_CORDIC=m CONFIG_PRIME_NUMBERS=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 853b2326a171..8163c1702720 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -70,7 +70,6 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_FS=y CONFIG_PANIC_ON_OOPS=y -# CONFIG_SCHED_DEBUG is not set CONFIG_RCU_CPU_STALL_TIMEOUT=60 # CONFIG_RCU_TRACE is not set # CONFIG_FTRACE is not set diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index dd291c9ad6a6..9980c17ba22d 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -602,7 +602,8 @@ SYM_CODE_START(stack_invalid) stmg %r0,%r7,__PT_R0(%r11) stmg %r8,%r9,__PT_PSW(%r11) mvc __PT_R8(64,%r11),0(%r14) - stg %r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2 + GET_LC %r2 + mvc __PT_ORIG_GPR2(8,%r11),__LC_PGM_LAST_BREAK(%r2) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs jg kernel_stack_invalid diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 9a929bbcc397..241f7251c873 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -428,6 +428,8 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data) return; } zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state); + if (IS_ERR(zdev)) + return; list_add_tail(&zdev->entry, scan_list); } diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 3a08f9029a3f..1c6e0ae41b0c 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -55,6 +55,7 @@ do { \ goto err_label; \ } \ *((type *)dst) = get_unaligned((type *)(src)); \ + barrier(); \ current->thread.segv_continue = NULL; \ } while (0) @@ -66,6 +67,7 @@ do { \ if (__faulted) \ goto err_label; \ put_unaligned(*((type *)src), (type *)(dst)); \ + barrier(); \ current->thread.segv_continue = NULL; \ } while (0) diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index ce073150dc20..ef2272e92a43 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -225,20 +225,20 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, panic("Failed to sync kernel TLBs: %d", err); goto out; } - else if (current->mm == NULL) { - if (current->pagefault_disabled) { - if (!mc) { - show_regs(container_of(regs, struct pt_regs, regs)); - panic("Segfault with pagefaults disabled but no mcontext"); - } - if (!current->thread.segv_continue) { - show_regs(container_of(regs, struct pt_regs, regs)); - panic("Segfault without recovery target"); - } - mc_set_rip(mc, current->thread.segv_continue); - current->thread.segv_continue = NULL; - goto out; + else if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, 
current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; + } + else if (current->mm == NULL) { show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index d4b3b6742ec8..2f5ee045bc7a 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -477,7 +477,7 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) return text_poke(addr, opcode, len); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4b9f378e05f6..297e33cfa760 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -153,6 +153,7 @@ config X86 select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH + select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_WATCHDOG @@ -2368,6 +2369,7 @@ config STRICT_SIGALTSTACK_SIZE config CFI_AUTO_DEFAULT bool "Attempt to use FineIBT by default at boot time" depends on FINEIBT + depends on !RUST || RUSTC_VERSION >= 108800 default y help Attempt to use FineIBT by default at boot time. If enabled, diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 594723005d95..53daf4654f6c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -281,6 +281,7 @@ archprepare: $(cpufeaturemasks.hdr) ### # Kernel objects +core-y += arch/x86/boot/startup/ libs-y += arch/x86/lib/ # drivers-y are linked after core-y diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index aa9b96457584..cf4a6155714e 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -32,7 +32,7 @@ intcall: movw %dx, %si movw %sp, %di movw $11, %cx - rep; movsl + rep movsl /* Pop full state from the stack */ popal @@ -67,7 +67,7 @@ intcall: jz 4f movw %sp, %si movw $11, %cx - rep; movsl + rep movsl 4: addw $44, %sp /* Restore state and return */ diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 38f17a1e1e36..60580836daf7 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -34,7 +34,7 @@ extern struct setup_header hdr; extern struct boot_params boot_params; -#define cpu_relax() asm volatile("rep; nop") +#define cpu_relax() asm volatile("pause") static inline void io_delay(void) { @@ -155,14 +155,14 @@ static inline void wrgs32(u32 v, addr_t addr) static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len) { bool diff; - asm volatile("fs; repe; cmpsb" CC_SET(nz) + asm volatile("fs repe cmpsb" CC_SET(nz) : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len) { bool diff; - asm volatile("gs; repe; cmpsb" CC_SET(nz) + asm volatile("gs repe cmpsb" CC_SET(nz) : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fdbce022db55..f4f7b22d8113 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -44,10 +44,10 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h -# sev.c indirectly includes inat-table.h which is generated during +# sev-decode-insn.c indirectly includes inat-table.c which is generated during # compilation and stored in $(objtree). 
Add the directory to the includes so # that the compiler finds it even with out-of-tree builds (make O=/some/path). -CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/ +CFLAGS_sev-handle-vc.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ @@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ @@ -96,8 +96,7 @@ ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o - vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o - vmlinux-objs-y += $(obj)/la57toggle.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o $(obj)/sev-handle-vc.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o @@ -106,6 +105,7 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a $(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index eafd4f185e77..d9dab940ff62 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -35,7 +35,6 @@ #include <asm/bootparam.h> #include <asm/desc_defs.h> #include <asm/trapnr.h> -#include "pgtable.h" /* * Fix alignment at 16 bytes. 
Following CONFIG_FUNCTION_ALIGNMENT will result diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 1cdcd4aaf395..94b5991da001 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,7 +14,6 @@ #include "misc.h" #include "error.h" -#include "pgtable.h" #include "../string.h" #include "../voffset.h" #include <asm/bootparam_utils.h> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index dd8d1a85f671..db1048621ea2 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -136,6 +136,9 @@ static inline void console_init(void) #endif #ifdef CONFIG_AMD_MEM_ENCRYPT +struct es_em_ctxt; +struct insn; + void sev_enable(struct boot_params *bp); void snp_check_features(void); void sev_es_shutdown_ghcb(void); @@ -143,6 +146,11 @@ extern bool sev_es_check_ghcb_fault(unsigned long address); void snp_set_page_private(unsigned long paddr); void snp_set_page_shared(unsigned long paddr); void sev_prep_identity_maps(unsigned long top_level_pgt); + +enum es_result vc_decode_insn(struct es_em_ctxt *ctxt); +bool insn_has_rep_prefix(struct insn *insn); +void sev_insn_decode_init(void); +bool early_setup_ghcb(void); #else static inline void sev_enable(struct boot_params *bp) { diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h deleted file mode 100644 index 6d595abe06b3..000000000000 --- a/arch/x86/boot/compressed/pgtable.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef BOOT_COMPRESSED_PAGETABLE_H -#define BOOT_COMPRESSED_PAGETABLE_H - -#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) - -#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE -#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 - -#ifndef __ASSEMBLER__ - -extern unsigned long *trampoline_32bit; - -extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl); - -extern const u16 trampoline_ljmp_imm_offset; - -#endif /* __ASSEMBLER__ */ -#endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index d8c5de40669d..5a6c7a190e5b 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -4,7 +4,6 @@ #include <asm/bootparam_utils.h> #include <asm/e820/types.h> #include <asm/processor.h> -#include "pgtable.h" #include "../string.h" #include "efi.h" diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c new file mode 100644 index 000000000000..89dd02de2a0f --- /dev/null +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "sev.h" + +#include <linux/kernel.h> +#include <linux/string.h> +#include <asm/insn.h> +#include <asm/pgtable_types.h> +#include <asm/ptrace.h> +#include <asm/sev.h> +#include <asm/trapnr.h> +#include <asm/trap_pf.h> +#include <asm/fpu/xcr.h> + +#define __BOOT_COMPRESSED + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. 
+ */ +bool insn_has_rep_prefix(struct insn *insn) +{ + insn_byte_t p; + int i; + + insn_get_prefixes(insn); + + for_each_insn_prefix(insn, i, p) { + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + + return ES_OK; +} + +extern void sev_insn_decode_init(void) __alias(inat_init_tables); + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + +#define sev_printk(fmt, ...) + +#include "../../coco/sev/vc-shared.c" + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_ghcb()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + result = vc_check_opcode_bytes(&ctxt, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) + vc_finish_insn(&ctxt); + else if (result != ES_RETRY) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 0003e4416efd..612b443296d3 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -24,96 +24,11 @@ #include <asm/cpuid.h> #include "error.h" -#include "../msr.h" +#include "sev.h" static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); struct ghcb *boot_ghcb; -/* - * Copy a version of this function here - insn-eval.c can't be used in - * pre-decompression code. - */ -static bool insn_has_rep_prefix(struct insn *insn) -{ - insn_byte_t p; - int i; - - insn_get_prefixes(insn); - - for_each_insn_prefix(insn, i, p) { - if (p == 0xf2 || p == 0xf3) - return true; - } - - return false; -} - -/* - * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and - * doesn't use segments. 
- */ -static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) -{ - return 0UL; -} - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - struct msr m; - - boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); - - return m.q; -} - -static inline void sev_es_wr_ghcb_msr(u64 val) -{ - struct msr m; - - m.q = val; - boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int ret; - - memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - - return ES_OK; -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - void *dst, char *buf, size_t size) -{ - memcpy(dst, buf, size); - - return ES_OK; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - void *src, char *buf, size_t size) -{ - memcpy(buf, src, size); - - return ES_OK; -} - -static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) -{ - return ES_OK; -} - -static bool fault_in_kernel_space(unsigned long address) -{ - return false; -} - #undef __init #define __init @@ -122,24 +37,27 @@ static bool fault_in_kernel_space(unsigned long address) #define __BOOT_COMPRESSED -/* Basic instruction decoding support needed */ -#include "../../lib/inat.c" -#include "../../lib/insn.c" +extern struct svsm_ca *boot_svsm_caa; +extern u64 boot_svsm_caa_pa; -/* Include code for early handlers */ -#include "../../coco/sev/shared.c" - -static struct svsm_ca *svsm_get_caa(void) +struct svsm_ca *svsm_get_caa(void) { return boot_svsm_caa; } -static u64 svsm_get_caa_pa(void) +u64 svsm_get_caa_pa(void) { return boot_svsm_caa_pa; } -static int svsm_perform_call_protocol(struct svsm_call *call) +int svsm_perform_call_protocol(struct svsm_call *call); + +u8 snp_vmpl; + +/* Include code for early handlers */ +#include "../../boot/startup/sev-shared.c" + +int svsm_perform_call_protocol(struct svsm_call *call) { struct ghcb *ghcb; int ret; @@ -157,7 +75,7 @@ static int svsm_perform_call_protocol(struct svsm_call *call) return ret; } -bool sev_snp_enabled(void) +static bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } @@ -212,7 +130,7 @@ void snp_set_page_shared(unsigned long paddr) __page_state_change(paddr, SNP_PAGE_STATE_SHARED); } -static bool early_setup_ghcb(void) +bool early_setup_ghcb(void) { if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -223,7 +141,7 @@ static bool early_setup_ghcb(void) boot_ghcb = &boot_ghcb_page; /* Initialize lookup tables for the instruction decoder */ - inat_init_tables(); + sev_insn_decode_init(); /* SNP guest requires the GHCB GPA must be registered */ if (sev_snp_enabled()) @@ -296,46 +214,6 @@ bool sev_es_check_ghcb_fault(unsigned long address) return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); } -void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) -{ - struct es_em_ctxt ctxt; - enum es_result result; - - if (!boot_ghcb && !early_setup_ghcb()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - vc_ghcb_invalidate(boot_ghcb); - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result != ES_OK) - goto finish; - - result = vc_check_opcode_bytes(&ctxt, exit_code); - if (result != ES_OK) - goto finish; - - switch (exit_code) { - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); - break; - case SVM_EXIT_IOIO: - result = 
vc_handle_ioio(boot_ghcb, &ctxt); - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(boot_ghcb, &ctxt); - break; - default: - result = ES_UNSUPPORTED; - break; - } - -finish: - if (result == ES_OK) - vc_finish_insn(&ctxt); - else if (result != ES_RETRY) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} - /* * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need * guest side implementation for proper functioning of the guest. If any diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h index d3900384b8ab..92f79c21939c 100644 --- a/arch/x86/boot/compressed/sev.h +++ b/arch/x86/boot/compressed/sev.h @@ -10,14 +10,31 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT -bool sev_snp_enabled(void); +#include "../msr.h" + void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 sev_get_status(void); bool early_is_sevsnp_guest(void); +static inline u64 sev_es_rd_ghcb_msr(void) +{ + struct msr m; + + boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); + + return m.q; +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + struct msr m; + + m.q = val; + boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); +} + #else -static inline bool sev_snp_enabled(void) { return false; } static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 sev_get_status(void) { return 0; } static inline bool early_is_sevsnp_guest(void) { return false; } diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 81fc1eaa3229..9af19d9614cb 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -15,9 +15,9 @@ static void *____memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( - "rep ; movsl\n\t" + "rep movsl\n\t" "movl %4,%%ecx\n\t" - "rep ; movsb\n\t" + "rep movsb" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) : "memory"); @@ -29,9 +29,9 @@ static void *____memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( - "rep ; movsq\n\t" + "rep movsq\n\t" "movq %4,%%rcx\n\t" - "rep ; movsb\n\t" + "rep movsb" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) : "memory"); diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index 6afd05e819d2..3973a67cd04e 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -22,10 +22,10 @@ SYM_FUNC_START_NOALIGN(memcpy) movw %dx, %si pushw %cx shrw $2, %cx - rep; movsl + rep movsl popw %cx andw $3, %cx - rep; movsb + rep movsb popw %di popw %si retl @@ -38,10 +38,10 @@ SYM_FUNC_START_NOALIGN(memset) imull $0x01010101,%eax pushw %cx shrw $2, %cx - rep; stosl + rep stosl popw %cx andw $3, %cx - rep; stosb + rep stosb popw %di retl SYM_FUNC_END(memset) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b5c79f43359b..9cb91421b4e4 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -585,7 +585,7 @@ start_of_setup: xorl %eax, %eax subw %di, %cx shrw $2, %cx - rep; stosl + rep stosl # Jump to C code (should not return) calll main diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile new file mode 100644 index 000000000000..b514f7e81332 --- /dev/null +++ b/arch/x86/boot/startup/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 + +KBUILD_AFLAGS += -D__DISABLE_EXPORTS +KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \ + -Os -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) \ + -fno-stack-protector -D__NO_FORTIFY \ + -fno-jump-tables \ + -include 
$(srctree)/include/linux/hidden.h + +# disable ftrace hooks and LTO +KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS)) +KASAN_SANITIZE := n +KCSAN_SANITIZE := n +KMSAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o + +lib-$(CONFIG_X86_64) += la57toggle.o +lib-$(CONFIG_EFI_MIXED) += efi-mixed.o + +# +# Disable objtool validation for all library code, which is intended +# to be linked into the decompressor or the EFI stub but not vmlinux +# +$(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y diff --git a/arch/x86/boot/startup/efi-mixed.S b/arch/x86/boot/startup/efi-mixed.S new file mode 100644 index 000000000000..e04ed99bc449 --- /dev/null +++ b/arch/x86/boot/startup/efi-mixed.S @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming + * + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to + * restore the firmware's 32-bit GDT and IDT before we make EFI service + * calls. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identity + * mapped pagetable and haven't transitioned to 64-bit virtual addresses + * yet. + */ + +#include <linux/linkage.h> +#include <asm/desc_defs.h> +#include <asm/msr.h> +#include <asm/page_types.h> +#include <asm/pgtable_types.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + + .text + .code32 +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL +SYM_FUNC_START(efi32_stub_entry) + call 1f +1: popl %ecx + + /* Clear BSS */ + xorl %eax, %eax + leal (_bss - 1b)(%ecx), %edi + leal (_ebss - 1b)(%ecx), %ecx + subl %edi, %ecx + shrl $2, %ecx + cld + rep stosl + + add $0x4, %esp /* Discard return address */ + movl 8(%esp), %ebx /* struct boot_params pointer */ + jmp efi32_startup +SYM_FUNC_END(efi32_stub_entry) +#endif + +/* + * Called using a far call from __efi64_thunk() below, using the x86_64 SysV + * ABI (except for R8/R9 which are inaccessible to 32-bit code - EAX/EBX are + * used instead). EBP+16 points to the arguments passed via the stack. + * + * The first argument (EDI) is a pointer to the boot service or protocol, to + * which the remaining arguments are passed, each truncated to 32 bits. 
+ */ +SYM_FUNC_START_LOCAL(efi_enter32) + /* + * Convert x86-64 SysV ABI params to i386 ABI + */ + pushl 32(%ebp) /* Up to 3 args passed via the stack */ + pushl 24(%ebp) + pushl 16(%ebp) + pushl %ebx /* R9 */ + pushl %eax /* R8 */ + pushl %ecx + pushl %edx + pushl %esi + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Disable long mode via EFER */ + movl $MSR_EFER, %ecx + rdmsr + btrl $_EFER_LME, %eax + wrmsr + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + call efi32_enable_long_mode + + addl $32, %esp + movl %edi, %eax + lret +SYM_FUNC_END(efi_enter32) + + .code64 +SYM_FUNC_START(__efi64_thunk) + push %rbp + movl %esp, %ebp + push %rbx + + /* Move args #5 and #6 into 32-bit accessible registers */ + movl %r8d, %eax + movl %r9d, %ebx + + lcalll *efi32_call(%rip) + + pop %rbx + pop %rbp + RET +SYM_FUNC_END(__efi64_thunk) + + .code32 +SYM_FUNC_START_LOCAL(efi32_enable_long_mode) + movl %cr4, %eax + btsl $(X86_CR4_PAE_BIT), %eax + movl %eax, %cr4 + + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Disable interrupts - the firmware's IDT does not work in long mode */ + cli + + /* Enable paging */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + ret +SYM_FUNC_END(efi32_enable_long_mode) + +/* + * This is the common EFI stub entry point for mixed mode. It sets up the GDT + * and page tables needed for 64-bit execution, after which it calls the + * common 64-bit EFI entrypoint efi_stub_entry(). + * + * Arguments: 0(%esp) image handle + * 4(%esp) EFI system table pointer + * %ebx struct boot_params pointer (or NULL) + * + * Since this is the point of no return for ordinary execution, no registers + * are considered live except for the function parameters. [Note that the EFI + * stub may still exit and return to the firmware using the Exit() EFI boot + * service.] 
+ */ +SYM_FUNC_START_LOCAL(efi32_startup) + movl %esp, %ebp + + subl $8, %esp + sgdtl (%esp) /* Save GDT descriptor to the stack */ + movl 2(%esp), %esi /* Existing GDT pointer */ + movzwl (%esp), %ecx /* Existing GDT limit */ + inc %ecx /* Existing GDT size */ + andl $~7, %ecx /* Ensure size is multiple of 8 */ + + subl %ecx, %esp /* Allocate new GDT */ + andl $~15, %esp /* Realign the stack */ + movl %esp, %edi /* New GDT address */ + leal 7(%ecx), %eax /* New GDT limit */ + pushw %cx /* Push 64-bit CS (for LJMP below) */ + pushl %edi /* Push new GDT address */ + pushw %ax /* Push new GDT limit */ + + /* Copy GDT to the stack and add a 64-bit code segment at the end */ + movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) & 0xffffffff, (%edi,%ecx) + movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) >> 32, 4(%edi,%ecx) + shrl $2, %ecx + cld + rep movsl /* Copy the firmware GDT */ + lgdtl (%esp) /* Switch to the new GDT */ + + call 1f +1: pop %edi + + /* Record mixed mode entry */ + movb $0x0, (efi_is64 - 1b)(%edi) + + /* Set up indirect far call to re-enter 32-bit mode */ + leal (efi32_call - 1b)(%edi), %eax + addl %eax, (%eax) + movw %cs, 4(%eax) + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Set up 1:1 mapping */ + leal (pte - 1b)(%edi), %eax + movl $_PAGE_PRESENT | _PAGE_RW | _PAGE_PSE, %ecx + leal (_PAGE_PRESENT | _PAGE_RW)(%eax), %edx +2: movl %ecx, (%eax) + addl $8, %eax + addl $PMD_SIZE, %ecx + jnc 2b + + movl $PAGE_SIZE, %ecx + .irpc l, 0123 + movl %edx, \l * 8(%eax) + addl %ecx, %edx + .endr + addl %ecx, %eax + movl %edx, (%eax) + movl %eax, %cr3 + + call efi32_enable_long_mode + + /* Set up far jump to 64-bit mode (CS is already on the stack) */ + leal (efi_stub_entry - 1b)(%edi), %eax + movl %eax, 2(%esp) + + movl 0(%ebp), %edi + movl 4(%ebp), %esi + movl %ebx, %edx + ljmpl *2(%esp) +SYM_FUNC_END(efi32_startup) + +/* + * efi_status_t efi32_pe_entry(efi_handle_t image_handle, + * efi_system_table_32_t *sys_table) + */ +SYM_FUNC_START(efi32_pe_entry) + pushl %ebx // save callee-save registers + + /* Check whether the CPU supports long mode */ + movl $0x80000001, %eax // assume extended info support + cpuid + btl $29, %edx // check long mode bit + jnc 1f + leal 8(%esp), %esp // preserve stack alignment + xor %ebx, %ebx // no struct boot_params pointer + jmp efi32_startup // only ESP and EBX remain live +1: movl $0x80000003, %eax // EFI_UNSUPPORTED + popl %ebx + RET +SYM_FUNC_END(efi32_pe_entry) + +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL + .org efi32_stub_entry + 0x200 + .code64 +SYM_FUNC_START_NOALIGN(efi64_stub_entry) + jmp efi_handover_entry +SYM_FUNC_END(efi64_stub_entry) +#endif + + .data + .balign 8 +SYM_DATA_START_LOCAL(efi32_call) + .long efi_enter32 - . + .word 0x0 +SYM_DATA_END(efi32_call) +SYM_DATA(efi_is64, .byte 1) + + .bss + .balign PAGE_SIZE +SYM_DATA_LOCAL(pte, .fill 6 * PAGE_SIZE, 1, 0) diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c new file mode 100644 index 000000000000..a3112a69b06a --- /dev/null +++ b/arch/x86/boot/startup/gdt_idt.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/linkage.h> +#include <linux/types.h> + +#include <asm/desc.h> +#include <asm/init.h> +#include <asm/setup.h> +#include <asm/sev.h> +#include <asm/trapnr.h> + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). 
In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. + */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +/* This may run while still in the direct mapping */ +void __head startup_64_load_idt(void *vc_handler) +{ + struct desc_ptr desc = { + .address = (unsigned long)rip_rel_ptr(bringup_idt_table), + .size = sizeof(bringup_idt_table) - 1, + }; + struct idt_data data; + gate_desc idt_desc; + + /* @vc_handler is set only for a VMM Communication Exception */ + if (vc_handler) { + init_idt_data(&data, X86_TRAP_VC, vc_handler); + idt_init_desc(&idt_desc, &data); + native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); + } + + native_load_idt(&desc); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. + */ +void __head startup_64_setup_gdt_idt(void) +{ + struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page); + void *handler = NULL; + + struct desc_ptr startup_gdt_descr = { + .address = (unsigned long)gp->gdt, + .size = GDT_SIZE - 1, + }; + + /* Load GDT */ + native_load_gdt(&startup_gdt_descr); + + /* New GDT is live - reload data segment registers */ + asm volatile("movl %%eax, %%ds\n" + "movl %%eax, %%ss\n" + "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + handler = rip_rel_ptr(vc_no_ghcb); + + startup_64_load_idt(handler); +} diff --git a/arch/x86/boot/compressed/la57toggle.S b/arch/x86/boot/startup/la57toggle.S index 9ee002387eb1..370075b4d95b 100644 --- a/arch/x86/boot/compressed/la57toggle.S +++ b/arch/x86/boot/startup/la57toggle.S @@ -5,7 +5,6 @@ #include <asm/boot.h> #include <asm/msr.h> #include <asm/processor-flags.h> -#include "pgtable.h" /* * This is the 32-bit trampoline that will be copied over to low memory. It diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c new file mode 100644 index 000000000000..099ae2559336 --- /dev/null +++ b/arch/x86/boot/startup/map_kernel.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pgtable.h> + +#include <asm/init.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/sev.h> + +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +extern unsigned int next_early_pgt; + +static inline bool check_la57_support(void) +{ + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + return false; + + /* + * 5-level paging is detected and enabled at kernel decompression + * stage. Only check if it has been enabled there. 
+ */ + if (!(native_read_cr4() & X86_CR4_LA57)) + return false; + + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + page_offset_base = __PAGE_OFFSET_BASE_L5; + vmalloc_base = __VMALLOC_BASE_L5; + vmemmap_base = __VMEMMAP_BASE_L5; + + return true; +} + +static unsigned long __head sme_postprocess_startup(struct boot_params *bp, + pmdval_t *pmd, + unsigned long p2v_offset) +{ + unsigned long paddr, paddr_end; + int i; + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (sme_get_me_mask()) { + paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted); + paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted); + + for (; paddr < paddr_end; paddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use the PA to get a *currently* valid virtual address. + */ + early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); + + i = pmd_index(paddr - p2v_offset); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +/* + * This code is compiled using PIC codegen because it will execute from the + * early 1:1 mapping of memory, which deviates from the mapping expected by the + * linker. Due to this deviation, taking the address of a global variable will + * produce an ambiguous result when using the plain & operator. Instead, + * rip_rel_ptr() must be used, which will return the RIP-relative address in + * the 1:1 mapping of memory. Kernel virtual addresses can be determined by + * subtracting p2v_offset from the RIP-relative address. + */ +unsigned long __head __startup_64(unsigned long p2v_offset, + struct boot_params *bp) +{ + pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts); + unsigned long physaddr = (unsigned long)rip_rel_ptr(_text); + unsigned long va_text, va_end; + unsigned long pgtable_flags; + unsigned long load_delta; + pgdval_t *pgd; + p4dval_t *p4d; + pudval_t *pud; + pmdval_t *pmd, pmd_entry; + bool la57; + int i; + + la57 = check_la57_support(); + + /* Is the address too large? */ + if (physaddr >> MAX_PHYSMEM_BITS) + for (;;); + + /* + * Compute the delta between the address I am compiled to run at + * and the address I am actually running at. + */ + phys_base = load_delta = __START_KERNEL_map + p2v_offset; + + /* Is the address not 2M aligned? 
*/ + if (load_delta & ~PMD_MASK) + for (;;); + + va_text = physaddr - p2v_offset; + va_end = (unsigned long)rip_rel_ptr(_end) - p2v_offset; + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + + /* Fixup the physical addresses in the page table */ + + pgd = rip_rel_ptr(early_top_pgt); + pgd[pgd_index(__START_KERNEL_map)] += load_delta; + + if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) { + p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt); + p4d[MAX_PTRS_PER_P4D - 1] += load_delta; + + pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; + } + + level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta; + level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta; + + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + level2_fixmap_pgt[i].pmd += load_delta; + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ + + pud = &early_pgts[0]->pmd; + pmd = &early_pgts[1]->pmd; + next_early_pgt = 2; + + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + + if (la57) { + p4d = &early_pgts[next_early_pgt++]->pmd; + + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; + + i = physaddr >> P4D_SHIFT; + p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + } else { + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; + } + + i = physaddr >> PUD_SHIFT; + pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + + pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); + pmd_entry += physaddr; + + for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { + int idx = i + (physaddr >> PMD_SHIFT); + + pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; + } + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. 
+ */ + + pmd = rip_rel_ptr(level2_kernel_pgt); + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index(va_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index(va_end); i++) + if (pmd[i] & _PAGE_PRESENT) + pmd[i] += load_delta; + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; + + return sme_postprocess_startup(bp, pmd, p2v_offset); +} diff --git a/arch/x86/coco/sev/shared.c b/arch/x86/boot/startup/sev-shared.c index 2e4122f8aa6b..7a706db87b93 100644 --- a/arch/x86/coco/sev/shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -14,76 +14,23 @@ #ifndef __BOOT_COMPRESSED #define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) -#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) -#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__) #else #undef WARN #define WARN(condition, format...) (!!(condition)) -#define sev_printk(fmt, ...) -#define sev_printk_rtl(fmt, ...) #undef vc_forward_exception #define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n") #endif /* * SVSM related information: - * When running under an SVSM, the VMPL that Linux is executing at must be - * non-zero. The VMPL is therefore used to indicate the presence of an SVSM. - * * During boot, the page tables are set up as identity mapped and later * changed to use kernel virtual addresses. Maintain separate virtual and * physical addresses for the CAA to allow SVSM functions to be used during * early boot, both with identity mapped virtual addresses and proper kernel * virtual addresses. */ -u8 snp_vmpl __ro_after_init; -EXPORT_SYMBOL_GPL(snp_vmpl); -static struct svsm_ca *boot_svsm_caa __ro_after_init; -static u64 boot_svsm_caa_pa __ro_after_init; - -static struct svsm_ca *svsm_get_caa(void); -static u64 svsm_get_caa_pa(void); -static int svsm_perform_call_protocol(struct svsm_call *call); - -/* I/O parameters for CPUID-related helpers */ -struct cpuid_leaf { - u32 fn; - u32 subfn; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; -}; - -/* - * Individual entries of the SNP CPUID table, as defined by the SNP - * Firmware ABI, Revision 0.9, Section 7.1, Table 14. - */ -struct snp_cpuid_fn { - u32 eax_in; - u32 ecx_in; - u64 xcr0_in; - u64 xss_in; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u64 __reserved; -} __packed; - -/* - * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, - * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit - * of 64 entries per CPUID table. 
- */ -#define SNP_CPUID_COUNT_MAX 64 - -struct snp_cpuid_table { - u32 count; - u32 __reserved1; - u64 __reserved2; - struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; -} __packed; +struct svsm_ca *boot_svsm_caa __ro_after_init; +u64 boot_svsm_caa_pa __ro_after_init; /* * Since feature negotiation related variables are set early in the boot @@ -107,7 +54,7 @@ static u32 cpuid_std_range_max __ro_after_init; static u32 cpuid_hyp_range_max __ro_after_init; static u32 cpuid_ext_range_max __ro_after_init; -static bool __init sev_es_check_cpu_features(void) +bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { error("RDRAND instruction not supported - no trusted source of randomness available\n"); @@ -117,7 +64,7 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void __head __noreturn +void __head __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; @@ -136,7 +83,7 @@ sev_es_terminate(unsigned int set, unsigned int reason) /* * The hypervisor features are available from GHCB version 2 onward. */ -static u64 get_hv_features(void) +u64 get_hv_features(void) { u64 val; @@ -153,7 +100,7 @@ static u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -static void snp_register_ghcb_early(unsigned long paddr) +void snp_register_ghcb_early(unsigned long paddr) { unsigned long pfn = paddr >> PAGE_SHIFT; u64 val; @@ -169,7 +116,7 @@ static void snp_register_ghcb_early(unsigned long paddr) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); } -static bool sev_es_negotiate_protocol(void) +bool sev_es_negotiate_protocol(void) { u64 val; @@ -190,39 +137,6 @@ static bool sev_es_negotiate_protocol(void) return true; } -static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) -{ - ghcb->save.sw_exit_code = 0; - __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); -} - -static bool vc_decoding_needed(unsigned long exit_code) -{ - /* Exceptions don't require to decode the instruction */ - return !(exit_code >= SVM_EXIT_EXCP_BASE && - exit_code <= SVM_EXIT_LAST_EXCP); -} - -static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, - struct pt_regs *regs, - unsigned long exit_code) -{ - enum es_result ret = ES_OK; - - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->regs = regs; - - if (vc_decoding_needed(exit_code)) - ret = vc_decode_insn(ctxt); - - return ret; -} - -static void vc_finish_insn(struct es_em_ctxt *ctxt) -{ - ctxt->regs->ip += ctxt->insn.length; -} - static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { u32 ret; @@ -344,7 +258,7 @@ static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) * Fill in protocol and format specifiers. This can be called very early * in the boot, so use rip-relative references as needed. 
*/ - ghcb->protocol_version = RIP_REL_REF(ghcb_version); + ghcb->protocol_version = ghcb_version; ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); @@ -371,10 +285,10 @@ static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) return svsm_process_result_codes(call); } -static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) { /* Fill in protocol and format specifiers */ ghcb->protocol_version = ghcb_version; @@ -473,9 +387,9 @@ static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid * while running with the initial identity mapping as well as the * switch-over to kernel virtual addresses later. */ -static const struct snp_cpuid_table *snp_cpuid_get_table(void) +const struct snp_cpuid_table *snp_cpuid_get_table(void) { - return &RIP_REL_REF(cpuid_table_copy); + return rip_rel_ptr(&cpuid_table_copy); } /* @@ -672,7 +586,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -static int __head +int __head snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -701,9 +615,9 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; /* Skip post-processing for out-of-range zero leafs. */ - if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) || - (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) || - (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max)))) + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) return 0; } @@ -782,391 +696,6 @@ fail: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } -static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, - unsigned long address, - bool write) -{ - if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_USER; - ctxt->fi.cr2 = address; - if (write) - ctxt->fi.error_code |= X86_PF_WRITE; - - return ES_EXCEPTION; - } - - return ES_OK; -} - -static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, - void *src, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, b = backwards ? -1 : 1; - unsigned long address = (unsigned long)src; - enum es_result ret; - - ret = vc_insn_string_check(ctxt, address, false); - if (ret != ES_OK) - return ret; - - for (i = 0; i < count; i++) { - void *s = src + (i * data_size * b); - char *d = buf + (i * data_size); - - ret = vc_read_mem(ctxt, s, d, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, - void *dst, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, s = backwards ? 
-1 : 1; - unsigned long address = (unsigned long)dst; - enum es_result ret; - - ret = vc_insn_string_check(ctxt, address, true); - if (ret != ES_OK) - return ret; - - for (i = 0; i < count; i++) { - void *d = dst + (i * data_size * s); - char *b = buf + (i * data_size); - - ret = vc_write_mem(ctxt, d, b, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -#define IOIO_TYPE_STR BIT(2) -#define IOIO_TYPE_IN 1 -#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) -#define IOIO_TYPE_OUT 0 -#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) - -#define IOIO_REP BIT(3) - -#define IOIO_ADDR_64 BIT(9) -#define IOIO_ADDR_32 BIT(8) -#define IOIO_ADDR_16 BIT(7) - -#define IOIO_DATA_32 BIT(6) -#define IOIO_DATA_16 BIT(5) -#define IOIO_DATA_8 BIT(4) - -#define IOIO_SEG_ES (0 << 10) -#define IOIO_SEG_DS (3 << 10) - -static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) -{ - struct insn *insn = &ctxt->insn; - size_t size; - u64 port; - - *exitinfo = 0; - - switch (insn->opcode.bytes[0]) { - /* INS opcodes */ - case 0x6c: - case 0x6d: - *exitinfo |= IOIO_TYPE_INS; - *exitinfo |= IOIO_SEG_ES; - port = ctxt->regs->dx & 0xffff; - break; - - /* OUTS opcodes */ - case 0x6e: - case 0x6f: - *exitinfo |= IOIO_TYPE_OUTS; - *exitinfo |= IOIO_SEG_DS; - port = ctxt->regs->dx & 0xffff; - break; - - /* IN immediate opcodes */ - case 0xe4: - case 0xe5: - *exitinfo |= IOIO_TYPE_IN; - port = (u8)insn->immediate.value & 0xffff; - break; - - /* OUT immediate opcodes */ - case 0xe6: - case 0xe7: - *exitinfo |= IOIO_TYPE_OUT; - port = (u8)insn->immediate.value & 0xffff; - break; - - /* IN register opcodes */ - case 0xec: - case 0xed: - *exitinfo |= IOIO_TYPE_IN; - port = ctxt->regs->dx & 0xffff; - break; - - /* OUT register opcodes */ - case 0xee: - case 0xef: - *exitinfo |= IOIO_TYPE_OUT; - port = ctxt->regs->dx & 0xffff; - break; - - default: - return ES_DECODE_FAILED; - } - - *exitinfo |= port << 16; - - switch (insn->opcode.bytes[0]) { - case 0x6c: - case 0x6e: - case 0xe4: - case 0xe6: - case 0xec: - case 0xee: - /* Single byte opcodes */ - *exitinfo |= IOIO_DATA_8; - size = 1; - break; - default: - /* Length determined by instruction parsing */ - *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 - : IOIO_DATA_32; - size = (insn->opnd_bytes == 2) ? 2 : 4; - } - - switch (insn->addr_bytes) { - case 2: - *exitinfo |= IOIO_ADDR_16; - break; - case 4: - *exitinfo |= IOIO_ADDR_32; - break; - case 8: - *exitinfo |= IOIO_ADDR_64; - break; - } - - if (insn_has_rep_prefix(insn)) - *exitinfo |= IOIO_REP; - - return vc_ioio_check(ctxt, (u16)port, size); -} - -static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u64 exit_info_1, exit_info_2; - enum es_result ret; - - ret = vc_ioio_exitinfo(ctxt, &exit_info_1); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_STR) { - - /* (REP) INS/OUTS */ - - bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); - unsigned int io_bytes, exit_bytes; - unsigned int ghcb_count, op_count; - unsigned long es_base; - u64 sw_scratch; - - /* - * For the string variants with rep prefix the amount of in/out - * operations per #VC exception is limited so that the kernel - * has a chance to take interrupts and re-schedule while the - * instruction is emulated. - */ - io_bytes = (exit_info_1 >> 4) & 0x7; - ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; - - op_count = (exit_info_1 & IOIO_REP) ? 
regs->cx : 1; - exit_info_2 = min(op_count, ghcb_count); - exit_bytes = exit_info_2 * io_bytes; - - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - /* Read bytes of OUTS into the shared buffer */ - if (!(exit_info_1 & IOIO_TYPE_IN)) { - ret = vc_insn_string_read(ctxt, - (void *)(es_base + regs->si), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - } - - /* - * Issue an VMGEXIT to the HV to consume the bytes from the - * shared buffer or to have it write them into the shared buffer - * depending on the instruction: OUTS or INS. - */ - sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); - ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, - exit_info_1, exit_info_2); - if (ret != ES_OK) - return ret; - - /* Read bytes from shared buffer into the guest's destination. */ - if (exit_info_1 & IOIO_TYPE_IN) { - ret = vc_insn_string_write(ctxt, - (void *)(es_base + regs->di), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - - if (df) - regs->di -= exit_bytes; - else - regs->di += exit_bytes; - } else { - if (df) - regs->si -= exit_bytes; - else - regs->si += exit_bytes; - } - - if (exit_info_1 & IOIO_REP) - regs->cx -= exit_info_2; - - ret = regs->cx ? ES_RETRY : ES_OK; - - } else { - - /* IN/OUT into/from rAX */ - - int bits = (exit_info_1 & 0x70) >> 1; - u64 rax = 0; - - if (!(exit_info_1 & IOIO_TYPE_IN)) - rax = lower_bits(regs->ax, bits); - - ghcb_set_rax(ghcb, rax); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_IN) { - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - regs->ax = lower_bits(ghcb->save.rax, bits); - } - } - - return ret; -} - -static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - struct cpuid_leaf leaf; - int ret; - - leaf.fn = regs->ax; - leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); - if (!ret) { - regs->ax = leaf.eax; - regs->bx = leaf.ebx; - regs->cx = leaf.ecx; - regs->dx = leaf.edx; - } - - return ret; -} - -static enum es_result vc_handle_cpuid(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u32 cr4 = native_read_cr4(); - enum es_result ret; - int snp_cpuid_ret; - - snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt); - if (!snp_cpuid_ret) - return ES_OK; - if (snp_cpuid_ret != -EOPNOTSUPP) - return ES_VMM_ERROR; - - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rcx(ghcb, regs->cx); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #GP - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - regs->ax = ghcb->save.rax; - regs->bx = ghcb->save.rbx; - regs->cx = ghcb->save.rcx; - regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); - enum es_result ret; - - /* - * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure - * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP - * instructions are being intercepted. 
If this should occur and Secure - * TSC is enabled, guest execution should be terminated as the guest - * cannot rely on the TSC value provided by the hypervisor. - */ - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) - return ES_VMM_ERROR; - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && - (!rdtscp || ghcb_rcx_is_valid(ghcb)))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - if (rdtscp) - ctxt->regs->cx = ghcb->save.rcx; - - return ES_OK; -} - struct cc_setup_data { struct setup_data header; u32 cc_blob_address; @@ -1224,36 +753,14 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; if (fn->eax_in == 0x0) - RIP_REL_REF(cpuid_std_range_max) = fn->eax; + cpuid_std_range_max = fn->eax; else if (fn->eax_in == 0x40000000) - RIP_REL_REF(cpuid_hyp_range_max) = fn->eax; + cpuid_hyp_range_max = fn->eax; else if (fn->eax_in == 0x80000000) - RIP_REL_REF(cpuid_ext_range_max) = fn->eax; + cpuid_ext_range_max = fn->eax; } } -static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, - int ret, u64 svsm_ret) -{ - WARN(1, "PVALIDATE failure: pfn: 0x%llx, action: %u, size: %u, ret: %d, svsm_ret: 0x%llx\n", - pfn, action, page_size, ret, svsm_ret); - - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); -} - -static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svsm_ret) -{ - unsigned int page_size; - bool action; - u64 pfn; - - pfn = pc->entry[pc->cur_index].pfn; - action = pc->entry[pc->cur_index].action; - page_size = pc->entry[pc->cur_index].page_size; - - __pval_terminate(pfn, action, page_size, ret, svsm_ret); -} - static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) { struct svsm_pvalidate_call *pc; @@ -1296,11 +803,7 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, { int ret; - /* - * This can be called very early during boot, so use rIP-relative - * references as needed. - */ - if (RIP_REL_REF(snp_vmpl)) { + if (snp_vmpl) { svsm_pval_4k_page(paddr, validate); } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); @@ -1309,351 +812,6 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, } } -static void pval_pages(struct snp_psc_desc *desc) -{ - struct psc_entry *e; - unsigned long vaddr; - unsigned int size; - unsigned int i; - bool validate; - u64 pfn; - int rc; - - for (i = 0; i <= desc->hdr.end_entry; i++) { - e = &desc->entries[i]; - - pfn = e->gfn; - vaddr = (unsigned long)pfn_to_kaddr(pfn); - size = e->pagesize ? 
RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; - validate = e->operation == SNP_PAGE_STATE_PRIVATE; - - rc = pvalidate(vaddr, size, validate); - if (!rc) - continue; - - if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { - unsigned long vaddr_end = vaddr + PMD_SIZE; - - for (; vaddr < vaddr_end; vaddr += PAGE_SIZE, pfn++) { - rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); - if (rc) - __pval_terminate(pfn, validate, RMP_PG_SIZE_4K, rc, 0); - } - } else { - __pval_terminate(pfn, validate, size, rc, 0); - } - } -} - -static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action, - struct svsm_pvalidate_call *pc) -{ - struct svsm_pvalidate_entry *pe; - - /* Nothing in the CA yet */ - pc->num_entries = 0; - pc->cur_index = 0; - - pe = &pc->entry[0]; - - while (pfn < pfn_end) { - pe->page_size = RMP_PG_SIZE_4K; - pe->action = action; - pe->ignore_cf = 0; - pe->pfn = pfn; - - pe++; - pfn++; - - pc->num_entries++; - if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT) - break; - } - - return pfn; -} - -static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int desc_entry, - struct svsm_pvalidate_call *pc) -{ - struct svsm_pvalidate_entry *pe; - struct psc_entry *e; - - /* Nothing in the CA yet */ - pc->num_entries = 0; - pc->cur_index = 0; - - pe = &pc->entry[0]; - e = &desc->entries[desc_entry]; - - while (desc_entry <= desc->hdr.end_entry) { - pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; - pe->action = e->operation == SNP_PAGE_STATE_PRIVATE; - pe->ignore_cf = 0; - pe->pfn = e->gfn; - - pe++; - e++; - - desc_entry++; - pc->num_entries++; - if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT) - break; - } - - return desc_entry; -} - -static void svsm_pval_pages(struct snp_psc_desc *desc) -{ - struct svsm_pvalidate_entry pv_4k[VMGEXIT_PSC_MAX_ENTRY]; - unsigned int i, pv_4k_count = 0; - struct svsm_pvalidate_call *pc; - struct svsm_call call = {}; - unsigned long flags; - bool action; - u64 pc_pa; - int ret; - - /* - * This can be called very early in the boot, use native functions in - * order to avoid paravirt issues. - */ - flags = native_local_irq_save(); - - /* - * The SVSM calling area (CA) can support processing 510 entries at a - * time. Loop through the Page State Change descriptor until the CA is - * full or the last entry in the descriptor is reached, at which time - * the SVSM is invoked. This repeats until all entries in the descriptor - * are processed. - */ - call.caa = svsm_get_caa(); - - pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; - pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); - - /* Protocol 0, Call ID 1 */ - call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); - call.rcx = pc_pa; - - for (i = 0; i <= desc->hdr.end_entry;) { - i = svsm_build_ca_from_psc_desc(desc, i, pc); - - do { - ret = svsm_perform_call_protocol(&call); - if (!ret) - continue; - - /* - * Check if the entry failed because of an RMP mismatch (a - * PVALIDATE at 2M was requested, but the page is mapped in - * the RMP as 4K). 
- */ - - if (call.rax_out == SVSM_PVALIDATE_FAIL_SIZEMISMATCH && - pc->entry[pc->cur_index].page_size == RMP_PG_SIZE_2M) { - /* Save this entry for post-processing at 4K */ - pv_4k[pv_4k_count++] = pc->entry[pc->cur_index]; - - /* Skip to the next one unless at the end of the list */ - pc->cur_index++; - if (pc->cur_index < pc->num_entries) - ret = -EAGAIN; - else - ret = 0; - } - } while (ret == -EAGAIN); - - if (ret) - svsm_pval_terminate(pc, ret, call.rax_out); - } - - /* Process any entries that failed to be validated at 2M and validate them at 4K */ - for (i = 0; i < pv_4k_count; i++) { - u64 pfn, pfn_end; - - action = pv_4k[i].action; - pfn = pv_4k[i].pfn; - pfn_end = pfn + 512; - - while (pfn < pfn_end) { - pfn = svsm_build_ca_from_pfn_range(pfn, pfn_end, action, pc); - - ret = svsm_perform_call_protocol(&call); - if (ret) - svsm_pval_terminate(pc, ret, call.rax_out); - } - } - - native_local_irq_restore(flags); -} - -static void pvalidate_pages(struct snp_psc_desc *desc) -{ - if (snp_vmpl) - svsm_pval_pages(desc); - else - pval_pages(desc); -} - -static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) -{ - int cur_entry, end_entry, ret = 0; - struct snp_psc_desc *data; - struct es_em_ctxt ctxt; - - vc_ghcb_invalidate(ghcb); - - /* Copy the input desc into GHCB shared buffer */ - data = (struct snp_psc_desc *)ghcb->shared_buffer; - memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - - /* - * As per the GHCB specification, the hypervisor can resume the guest - * before processing all the entries. Check whether all the entries - * are processed. If not, then keep retrying. Note, the hypervisor - * will update the data memory directly to indicate the status, so - * reference the data->hdr everywhere. - * - * The strategy here is to wait for the hypervisor to change the page - * state in the RMP table before guest accesses the memory pages. If the - * page state change was not successful, then later memory access will - * result in a crash. - */ - cur_entry = data->hdr.cur_entry; - end_entry = data->hdr.end_entry; - - while (data->hdr.cur_entry <= data->hdr.end_entry) { - ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - - /* This will advance the shared buffer data points to. */ - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); - - /* - * Page State Change VMGEXIT can pass error code through - * exit_info_2. - */ - if (WARN(ret || ghcb->save.sw_exit_info_2, - "SNP: PSC failed ret=%d exit_info_2=%llx\n", - ret, ghcb->save.sw_exit_info_2)) { - ret = 1; - goto out; - } - - /* Verify that reserved bit is not set */ - if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { - ret = 1; - goto out; - } - - /* - * Sanity check that entry processing is not going backwards. - * This will happen only if hypervisor is tricking us. 
- */ - if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, -"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", - end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { - ret = 1; - goto out; - } - } - -out: - return ret; -} - -static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; - u8 modrm = ctxt->insn.modrm.value; - - switch (exit_code) { - - case SVM_EXIT_IOIO: - case SVM_EXIT_NPF: - /* handled separately */ - return ES_OK; - - case SVM_EXIT_CPUID: - if (opcode == 0xa20f) - return ES_OK; - break; - - case SVM_EXIT_INVD: - if (opcode == 0x080f) - return ES_OK; - break; - - case SVM_EXIT_MONITOR: - /* MONITOR and MONITORX instructions generate the same error code */ - if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa)) - return ES_OK; - break; - - case SVM_EXIT_MWAIT: - /* MWAIT and MWAITX instructions generate the same error code */ - if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb)) - return ES_OK; - break; - - case SVM_EXIT_MSR: - /* RDMSR */ - if (opcode == 0x320f || - /* WRMSR */ - opcode == 0x300f) - return ES_OK; - break; - - case SVM_EXIT_RDPMC: - if (opcode == 0x330f) - return ES_OK; - break; - - case SVM_EXIT_RDTSC: - if (opcode == 0x310f) - return ES_OK; - break; - - case SVM_EXIT_RDTSCP: - if (opcode == 0x010f && modrm == 0xf9) - return ES_OK; - break; - - case SVM_EXIT_READ_DR7: - if (opcode == 0x210f && - X86_MODRM_REG(ctxt->insn.modrm.value) == 7) - return ES_OK; - break; - - case SVM_EXIT_VMMCALL: - if (opcode == 0x010f && modrm == 0xd9) - return ES_OK; - - break; - - case SVM_EXIT_WRITE_DR7: - if (opcode == 0x230f && - X86_MODRM_REG(ctxt->insn.modrm.value) == 7) - return ES_OK; - break; - - case SVM_EXIT_WBINVD: - if (opcode == 0x90f) - return ES_OK; - break; - - default: - break; - } - - sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", - opcode, exit_code, ctxt->regs->ip); - - return ES_UNSUPPORTED; -} - /* * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. @@ -1681,7 +839,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * routine is running identity mapped when called, both by the decompressor * code and the early kernel code. */ - if (!rmpadjust((unsigned long)&RIP_REL_REF(boot_ghcb_page), RMP_PG_SIZE_4K, 1)) + if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1)) return false; /* @@ -1698,7 +856,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) if (!secrets_page->svsm_guest_vmpl) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_VMPL0); - RIP_REL_REF(snp_vmpl) = secrets_page->svsm_guest_vmpl; + snp_vmpl = secrets_page->svsm_guest_vmpl; caa = secrets_page->svsm_caa; @@ -1713,8 +871,8 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * The CA is identity mapped when this routine is called, both by the * decompressor code and the early kernel code. */ - RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)caa; - RIP_REL_REF(boot_svsm_caa_pa) = caa; + boot_svsm_caa = (struct svsm_ca *)caa; + boot_svsm_caa_pa = caa; /* Advertise the SVSM presence via CPUID. 
*/ cpuid_table = (struct snp_cpuid_table *)snp_cpuid_get_table(); diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c new file mode 100644 index 000000000000..435853a55768 --- /dev/null +++ b/arch/x86/boot/startup/sev-startup.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include <linux/percpu-defs.h> +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/efi.h> +#include <linux/io.h> +#include <linux/psp-sev.h> +#include <uapi/linux/sev-guest.h> + +#include <asm/init.h> +#include <asm/cpu_entry_area.h> +#include <asm/stacktrace.h> +#include <asm/sev.h> +#include <asm/sev-internal.h> +#include <asm/insn-eval.h> +#include <asm/fpu/xcr.h> +#include <asm/processor.h> +#include <asm/realmode.h> +#include <asm/setup.h> +#include <asm/traps.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> +#include <asm/cmdline.h> + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +struct ghcb *boot_ghcb __section(".data"); + +/* Bitmap of SEV features supported by the hypervisor */ +u64 sev_hv_features __ro_after_init; + +/* Secrets page physical address from the CC blob */ +u64 sev_secrets_pa __ro_after_init; + +/* For early boot SVSM communication */ +struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); + +DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); +DEFINE_PER_CPU(u64, svsm_caa_pa); + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! 
GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-shared.c" + +noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} + +int svsm_perform_call_protocol(struct svsm_call *call) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + /* + * This can be called very early in the boot, use native functions in + * order to avoid paravirt issues. + */ + flags = native_local_irq_save(); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else if (boot_ghcb) + ghcb = boot_ghcb; + else + ghcb = NULL; + + do { + ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) + : svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + native_local_irq_restore(flags); + + return ret; +} + +void __head +early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + + vaddr = vaddr & PAGE_MASK; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + /* Page validation must be rescinded before changing to shared */ + if (op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false); + + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) + goto e_term; + + if (GHCB_MSR_PSC_RESP_VAL(val)) + goto e_term; + + /* Page validation must be performed after changing to private */ + if (op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true); + + vaddr += PAGE_SIZE; + paddr += PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned long npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. 
+ */ + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); +} + +void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned long npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. + */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +static __head void svsm_setup(struct cc_blob_sev_info *cc_info) +{ + struct svsm_call call = {}; + int ret; + u64 pa; + + /* + * Record the SVSM Calling Area address (CAA) if the guest is not + * running at VMPL0. The CA will be used to communicate with the + * SVSM to perform the SVSM services. + */ + if (!svsm_setup_ca(cc_info)) + return; + + /* + * It is very early in the boot and the kernel is running identity + * mapped but without having adjusted the pagetables to where the + * kernel was loaded (physbase), so the get the CA address using + * RIP-relative addressing. + */ + pa = (u64)rip_rel_ptr(&boot_svsm_ca_page); + + /* + * Switch over to the boot SVSM CA while the current CA is still + * addressable. There is no GHCB at this point so use the MSR protocol. + * + * SVSM_CORE_REMAP_CA call: + * RAX = 0 (Protocol=0, CallID=0) + * RCX = New CA GPA + */ + call.caa = svsm_get_caa(); + call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); + call.rcx = pa; + ret = svsm_perform_call_protocol(&call); + if (ret) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); + + boot_svsm_caa = (struct svsm_ca *)pa; + boot_svsm_caa_pa = pa; +} + +bool __head snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE) + sev_secrets_pa = cc_info->secrets_phys; + else + return false; + + setup_cpuid_table(cc_info); + + svsm_setup(cc_info); + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. 
+ */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __head __noreturn snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/boot/startup/sme.c index 5eecdd92da10..753cd2094080 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/boot/startup/sme.c @@ -45,8 +45,6 @@ #include <asm/coco.h> #include <asm/sev.h> -#include "mm_internal.h" - #define PGD_FLAGS _KERNPG_TABLE_NOENC #define P4D_FLAGS _KERNPG_TABLE_NOENC #define PUD_FLAGS _KERNPG_TABLE_NOENC @@ -299,8 +297,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) * instrumentation or checking boot_cpu_data in the cc_platform_has() * function. */ - if (!sme_get_me_mask() || - RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED) + if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED) return; /* @@ -318,8 +315,8 @@ void __head sme_encrypt_kernel(struct boot_params *bp) * memory from being cached. */ - kernel_start = (unsigned long)RIP_REL_REF(_text); - kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE); + kernel_start = (unsigned long)rip_rel_ptr(_text); + kernel_end = ALIGN((unsigned long)rip_rel_ptr(_end), PMD_SIZE); kernel_len = kernel_end - kernel_start; initrd_start = 0; @@ -345,7 +342,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) * pagetable structures for the encryption of the kernel * pagetable structures for workarea (in case not currently mapped) */ - execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea); + execute_start = workarea_start = (unsigned long)rip_rel_ptr(sme_workarea); execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE; execute_len = execute_end - execute_start; @@ -526,7 +523,7 @@ void __head sme_enable(struct boot_params *bp) me_mask = 1UL << (ebx & 0x3f); /* Check the SEV MSR whether SEV or SME is enabled */ - RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV); + sev_status = msr = __rdmsr(MSR_AMD64_SEV); feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? 
AMD_SEV_BIT : AMD_SME_BIT; /* @@ -562,8 +559,17 @@ void __head sme_enable(struct boot_params *bp) return; } - RIP_REL_REF(sme_me_mask) = me_mask; - RIP_REL_REF(physical_mask) &= ~me_mask; - RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD; + sme_me_mask = me_mask; + physical_mask &= ~me_mask; + cc_vendor = CC_VENDOR_AMD; cc_set_mask(me_mask); } + +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION +/* Local version for startup code, which never operates on user page tables */ +__weak +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 84f7a883ce1e..f35369bb14c5 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -32,7 +32,7 @@ int memcmp(const void *s1, const void *s2, size_t len) { bool diff; - asm("repe; cmpsb" CC_SET(nz) + asm("repe cmpsb" CC_SET(nz) : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index f2e96905b3fe..0641c8c46aee 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -292,7 +292,7 @@ static void restore_screen(void) "shrw %%cx ; " "jnc 1f ; " "stosw \n\t" - "1: rep;stosl ; " + "1: rep stosl ; " "popw %%es" : "+D" (dst), "+c" (npad) : "bdS" (video_segment), diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 9a0ddda3aa69..d4610af68114 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -18,7 +18,9 @@ #include <asm/processor.h> enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE; +SYM_PIC_ALIAS(cc_vendor); u64 cc_mask __ro_after_init; +SYM_PIC_ALIAS(cc_mask); static struct cc_attr_flags { __u64 host_sev_snp : 1, diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index dcb06dc8b5ae..db3255b979bd 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -1,22 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += core.o - -# jump tables are emitted using absolute references in non-PIC code -# so they cannot be used in the early SEV startup code -CFLAGS_core.o += -fno-jump-tables - -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_core.o = -pg -endif - -KASAN_SANITIZE_core.o := n -KMSAN_SANITIZE_core.o := n -KCOV_INSTRUMENT_core.o := n - -# With some compiler versions the generated code results in boot hangs, caused -# by several compilation units. To be safe, disable all instrumentation. 
-KCSAN_SANITIZE := n +obj-y += core.o sev-nmi.o vc-handle.o # Clang 14 and older may fail to respect __no_sanitize_undefined when inlining -UBSAN_SANITIZE := n +UBSAN_SANITIZE_sev-nmi.o := n + +# GCC may fail to respect __no_sanitize_address when inlining +KASAN_SANITIZE_sev-nmi.o := n diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b0c1a7a57497..ac400525de73 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -31,6 +31,7 @@ #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> #include <asm/sev.h> +#include <asm/sev-internal.h> #include <asm/insn-eval.h> #include <asm/fpu/xcr.h> #include <asm/processor.h> @@ -44,8 +45,6 @@ #include <asm/cpuid.h> #include <asm/cmdline.h> -#define DR7_RESET_VALUE 0x400 - /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ #define AP_INIT_CS_LIMIT 0xffff #define AP_INIT_DS_LIMIT 0xffff @@ -81,21 +80,6 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", }; -/* For early boot hypervisor communication in SEV-ES enabled guests */ -static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -static struct ghcb *boot_ghcb __section(".data"); - -/* Bitmap of SEV features supported by the hypervisor */ -static u64 sev_hv_features __ro_after_init; - -/* Secrets page physical address from the CC blob */ -static u64 secrets_pa __ro_after_init; - /* * For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and * initializes snp_tsc_scale and snp_tsc_offset. These values are replicated @@ -105,558 +89,196 @@ static u64 snp_tsc_scale __ro_after_init; static u64 snp_tsc_offset __ro_after_init; static u64 snp_tsc_freq_khz __ro_after_init; -/* #VC handler runtime per-CPU data */ -struct sev_es_runtime_data { - struct ghcb ghcb_page; - - /* - * Reserve one page per CPU as backup storage for the unencrypted GHCB. - * It is needed when an NMI happens while the #VC handler uses the real - * GHCB, and the NMI handler itself is causing another #VC exception. In - * that case the GHCB content of the first handler needs to be backed up - * and restored. - */ - struct ghcb backup_ghcb; - - /* - * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. - * There is no need for it to be atomic, because nothing is written to - * the GHCB between the read and the write of ghcb_active. So it is safe - * to use it when a nested #VC exception happens before the write. - * - * This is necessary for example in the #VC->NMI->#VC case when the NMI - * happens while the first #VC handler uses the GHCB. When the NMI code - * raises a second #VC handler it might overwrite the contents of the - * GHCB written by the first handler. To avoid this the content of the - * GHCB is saved and restored when the GHCB is detected to be in use - * already. - */ - bool ghcb_active; - bool backup_ghcb_active; - - /* - * Cached DR7 value - write it on DR7 writes and return it on reads. - * That value will never make it to the real hardware DR7 as debugging - * is currently unsupported in SEV-ES guests. 
- */ - unsigned long dr7; -}; - -struct ghcb_state { - struct ghcb *ghcb; -}; - -/* For early boot SVSM communication */ -static struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); - -static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); -static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); -static DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); -static DEFINE_PER_CPU(u64, svsm_caa_pa); - -static __always_inline bool on_vc_stack(struct pt_regs *regs) -{ - unsigned long sp = regs->sp; - - /* User-mode RSP is not trusted */ - if (user_mode(regs)) - return false; - - /* SYSCALL gap still has user-mode RSP */ - if (ip_within_syscall_gap(regs)) - return false; - - return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); -} +DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); /* - * This function handles the case when an NMI is raised in the #VC - * exception handler entry code, before the #VC handler has switched off - * its IST stack. In this case, the IST entry for #VC must be adjusted, - * so that any nested #VC exception will not overwrite the stack - * contents of the interrupted #VC handler. - * - * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a - * nested sev_es_ist_exit() call may adjust back the IST entry too - * early. - * - * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run - * on the NMI IST stack, as they are only called from NMI handling code - * right now. + * SVSM related information: + * When running under an SVSM, the VMPL that Linux is executing at must be + * non-zero. The VMPL is therefore used to indicate the presence of an SVSM. */ -void noinstr __sev_es_ist_enter(struct pt_regs *regs) -{ - unsigned long old_ist, new_ist; - - /* Read old IST entry */ - new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - /* - * If NMI happened while on the #VC IST stack, set the new IST - * value below regs->sp, so that the interrupted stack frame is - * not overwritten by subsequent #VC exceptions. - */ - if (on_vc_stack(regs)) - new_ist = regs->sp; - - /* - * Reserve additional 8 bytes and store old IST value so this - * adjustment can be unrolled in __sev_es_ist_exit(). - */ - new_ist -= sizeof(old_ist); - *(unsigned long *)new_ist = old_ist; - - /* Set new IST entry */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); -} +u8 snp_vmpl __ro_after_init; +EXPORT_SYMBOL_GPL(snp_vmpl); -void noinstr __sev_es_ist_exit(void) +static u64 __init get_snp_jump_table_addr(void) { - unsigned long ist; + struct snp_secrets_page *secrets; + void __iomem *mem; + u64 addr; - /* Read IST entry */ - ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + mem = ioremap_encrypted(sev_secrets_pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } - if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) - return; + secrets = (__force struct snp_secrets_page *)mem; - /* Read back old IST entry and write it to the TSS */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); + addr = secrets->os_area.ap_jump_table_pa; + iounmap(mem); + + return addr; } -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. - * - * Callers must disable local interrupts around it. 
- */ -static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +static u64 __init get_jump_table_addr(void) { - struct sev_es_runtime_data *data; + struct ghcb_state state; + unsigned long flags; struct ghcb *ghcb; + u64 ret = 0; - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); - state->ghcb = &data->backup_ghcb; + local_irq_save(flags); - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } + ghcb = __sev_get_ghcb(&state); - return ghcb; -} + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); -static inline u64 sev_es_rd_ghcb_msr(void) -{ - return __rdmsr(MSR_AMD64_SEV_ES_GHCB); -} + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); -static __always_inline void sev_es_wr_ghcb_msr(u64 val) -{ - u32 low, high; + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; - low = (u32)(val); - high = (u32)(val >> 32); + __sev_put_ghcb(&state); - native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); -} + local_irq_restore(flags); -static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, - unsigned char *buffer) -{ - return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + return ret; } -static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) +static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, + int ret, u64 svsm_ret) { - char buffer[MAX_INSN_SIZE]; - int insn_bytes; - - insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (insn_bytes == 0) { - /* Nothing could be copied */ - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } else if (insn_bytes == -EINVAL) { - /* Effective RIP could not be calculated */ - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - ctxt->fi.cr2 = 0; - return ES_EXCEPTION; - } - - if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) - return ES_DECODE_FAILED; + WARN(1, "PVALIDATE failure: pfn: 0x%llx, action: %u, size: %u, ret: %d, svsm_ret: 0x%llx\n", + pfn, action, page_size, ret, svsm_ret); - if (ctxt->insn.immediate.got) - return ES_OK; - else - return ES_DECODE_FAILED; + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); } -static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svsm_ret) { - char buffer[MAX_INSN_SIZE]; - int res, ret; + unsigned int page_size; + bool action; + u64 pfn; - res = vc_fetch_insn_kernel(ctxt, buffer); - if (res) { - ctxt->fi.vector = 
X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } + pfn = pc->entry[pc->cur_index].pfn; + action = pc->entry[pc->cur_index].action; + page_size = pc->entry[pc->cur_index].page_size; - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - else - return ES_OK; + __pval_terminate(pfn, action, page_size, ret, svsm_ret); } -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +static void pval_pages(struct snp_psc_desc *desc) { - if (user_mode(ctxt->regs)) - return __vc_decode_user_insn(ctxt); - else - return __vc_decode_kern_insn(ctxt); -} + struct psc_entry *e; + unsigned long vaddr; + unsigned int size; + unsigned int i; + bool validate; + u64 pfn; + int rc; -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - char *dst, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + for (i = 0; i <= desc->hdr.end_entry; i++) { + e = &desc->entries[i]; - /* - * This function uses __put_user() independent of whether kernel or user - * memory is accessed. This works fine because __put_user() does no - * sanity checks of the pointer being accessed. All that it does is - * to report when the access failed. - * - * Also, this function runs in atomic context, so __put_user() is not - * allowed to sleep. The page-fault handler detects that it is running - * in atomic context and will not try to take mmap_sem and handle the - * fault, so additional pagefault_enable()/disable() calls are not - * needed. - * - * The access can't be done via copy_to_user() here because - * vc_write_mem() must not use string instructions to access unsafe - * memory. The reason is that MOVS is emulated by the #VC handler by - * splitting the move up into a read and a write and taking a nested #VC - * exception on whatever of them is the MMIO access. Using string - * instructions here would cause infinite nesting. - */ - switch (size) { - case 1: { - u8 d1; - u8 __user *target = (u8 __user *)dst; - - memcpy(&d1, buf, 1); - if (__put_user(d1, target)) - goto fault; - break; - } - case 2: { - u16 d2; - u16 __user *target = (u16 __user *)dst; + pfn = e->gfn; + vaddr = (unsigned long)pfn_to_kaddr(pfn); + size = e->pagesize ? 
RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; + validate = e->operation == SNP_PAGE_STATE_PRIVATE; - memcpy(&d2, buf, 2); - if (__put_user(d2, target)) - goto fault; - break; - } - case 4: { - u32 d4; - u32 __user *target = (u32 __user *)dst; + rc = pvalidate(vaddr, size, validate); + if (!rc) + continue; - memcpy(&d4, buf, 4); - if (__put_user(d4, target)) - goto fault; - break; - } - case 8: { - u64 d8; - u64 __user *target = (u64 __user *)dst; + if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { + unsigned long vaddr_end = vaddr + PMD_SIZE; - memcpy(&d8, buf, 8); - if (__put_user(d8, target)) - goto fault; - break; - } - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE, pfn++) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (rc) + __pval_terminate(pfn, validate, RMP_PG_SIZE_4K, rc, 0); + } + } else { + __pval_terminate(pfn, validate, size, rc, 0); + } } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)dst; - - return ES_EXCEPTION; } -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - char *src, char *buf, size_t size) +static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action, + struct svsm_pvalidate_call *pc) { - unsigned long error_code = X86_PF_PROT; - - /* - * This function uses __get_user() independent of whether kernel or user - * memory is accessed. This works fine because __get_user() does no - * sanity checks of the pointer being accessed. All that it does is - * to report when the access failed. - * - * Also, this function runs in atomic context, so __get_user() is not - * allowed to sleep. The page-fault handler detects that it is running - * in atomic context and will not try to take mmap_sem and handle the - * fault, so additional pagefault_enable()/disable() calls are not - * needed. - * - * The access can't be done via copy_from_user() here because - * vc_read_mem() must not use string instructions to access unsafe - * memory. The reason is that MOVS is emulated by the #VC handler by - * splitting the move up into a read and a write and taking a nested #VC - * exception on whatever of them is the MMIO access. Using string - * instructions here would cause infinite nesting. 
- */ - switch (size) { - case 1: { - u8 d1; - u8 __user *s = (u8 __user *)src; - - if (__get_user(d1, s)) - goto fault; - memcpy(buf, &d1, 1); - break; - } - case 2: { - u16 d2; - u16 __user *s = (u16 __user *)src; - - if (__get_user(d2, s)) - goto fault; - memcpy(buf, &d2, 2); - break; - } - case 4: { - u32 d4; - u32 __user *s = (u32 __user *)src; - - if (__get_user(d4, s)) - goto fault; - memcpy(buf, &d4, 4); - break; - } - case 8: { - u64 d8; - u64 __user *s = (u64 __user *)src; - if (__get_user(d8, s)) - goto fault; - memcpy(buf, &d8, 8); - break; - } - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; + struct svsm_pvalidate_entry *pe; - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)src; + /* Nothing in the CA yet */ + pc->num_entries = 0; + pc->cur_index = 0; - return ES_EXCEPTION; -} + pe = &pc->entry[0]; -static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned long vaddr, phys_addr_t *paddr) -{ - unsigned long va = (unsigned long)vaddr; - unsigned int level; - phys_addr_t pa; - pgd_t *pgd; - pte_t *pte; + while (pfn < pfn_end) { + pe->page_size = RMP_PG_SIZE_4K; + pe->action = action; + pe->ignore_cf = 0; + pe->pfn = pfn; - pgd = __va(read_cr3_pa()); - pgd = &pgd[pgd_index(va)]; - pte = lookup_address_in_pgd(pgd, va, &level); - if (!pte) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.cr2 = vaddr; - ctxt->fi.error_code = 0; + pe++; + pfn++; - if (user_mode(ctxt->regs)) - ctxt->fi.error_code |= X86_PF_USER; - - return ES_EXCEPTION; + pc->num_entries++; + if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT) + break; } - if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) - /* Emulated MMIO to/from encrypted memory not supported */ - return ES_UNSUPPORTED; - - pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - pa |= va & ~page_level_mask(level); - - *paddr = pa; - - return ES_OK; + return pfn; } -static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int desc_entry, + struct svsm_pvalidate_call *pc) { - BUG_ON(size > 4); - - if (user_mode(ctxt->regs)) { - struct thread_struct *t = ¤t->thread; - struct io_bitmap *iobm = t->io_bitmap; - size_t idx; - - if (!iobm) - goto fault; - - for (idx = port; idx < port + size; ++idx) { - if (test_bit(idx, iobm->bitmap)) - goto fault; - } - } - - return ES_OK; + struct svsm_pvalidate_entry *pe; + struct psc_entry *e; -fault: - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; + /* Nothing in the CA yet */ + pc->num_entries = 0; + pc->cur_index = 0; - return ES_EXCEPTION; -} + pe = &pc->entry[0]; + e = &desc->entries[desc_entry]; -static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) -{ - long error_code = ctxt->fi.error_code; - int trapnr = ctxt->fi.vector; + while (desc_entry <= desc->hdr.end_entry) { + pe->page_size = e->pagesize ? 
RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; + pe->action = e->operation == SNP_PAGE_STATE_PRIVATE; + pe->ignore_cf = 0; + pe->pfn = e->gfn; - ctxt->regs->orig_ax = ctxt->fi.error_code; + pe++; + e++; - switch (trapnr) { - case X86_TRAP_GP: - exc_general_protection(ctxt->regs, error_code); - break; - case X86_TRAP_UD: - exc_invalid_op(ctxt->regs); - break; - case X86_TRAP_PF: - write_cr2(ctxt->fi.cr2); - exc_page_fault(ctxt->regs, error_code); - break; - case X86_TRAP_AC: - exc_alignment_check(ctxt->regs, error_code); - break; - default: - pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); - BUG(); + desc_entry++; + pc->num_entries++; + if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT) + break; } -} - -/* Include code shared with pre-decompression boot stage */ -#include "shared.c" - -static inline struct svsm_ca *svsm_get_caa(void) -{ - /* - * Use rIP-relative references when called early in the boot. If - * ->use_cas is set, then it is late in the boot and no need - * to worry about rIP-relative references. - */ - if (RIP_REL_REF(sev_cfg).use_cas) - return this_cpu_read(svsm_caa); - else - return RIP_REL_REF(boot_svsm_caa); -} - -static u64 svsm_get_caa_pa(void) -{ - /* - * Use rIP-relative references when called early in the boot. If - * ->use_cas is set, then it is late in the boot and no need - * to worry about rIP-relative references. - */ - if (RIP_REL_REF(sev_cfg).use_cas) - return this_cpu_read(svsm_caa_pa); - else - return RIP_REL_REF(boot_svsm_caa_pa); -} - -static noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. - */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } + return desc_entry; } -static int svsm_perform_call_protocol(struct svsm_call *call) +static void svsm_pval_pages(struct snp_psc_desc *desc) { - struct ghcb_state state; + struct svsm_pvalidate_entry pv_4k[VMGEXIT_PSC_MAX_ENTRY]; + unsigned int i, pv_4k_count = 0; + struct svsm_pvalidate_call *pc; + struct svsm_call call = {}; unsigned long flags; - struct ghcb *ghcb; + bool action; + u64 pc_pa; int ret; /* @@ -666,180 +288,145 @@ static int svsm_perform_call_protocol(struct svsm_call *call) flags = native_local_irq_save(); /* - * Use rip-relative references when called early in the boot. If - * ghcbs_initialized is set, then it is late in the boot and no need - * to worry about rip-relative references in called functions. + * The SVSM calling area (CA) can support processing 510 entries at a + * time. Loop through the Page State Change descriptor until the CA is + * full or the last entry in the descriptor is reached, at which time + * the SVSM is invoked. This repeats until all entries in the descriptor + * are processed. */ - if (RIP_REL_REF(sev_cfg).ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else if (RIP_REL_REF(boot_ghcb)) - ghcb = RIP_REL_REF(boot_ghcb); - else - ghcb = NULL; + call.caa = svsm_get_caa(); - do { - ret = ghcb ? 
svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); + pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; + pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); - if (RIP_REL_REF(sev_cfg).ghcbs_initialized) - __sev_put_ghcb(&state); + /* Protocol 0, Call ID 1 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); + call.rcx = pc_pa; - native_local_irq_restore(flags); + for (i = 0; i <= desc->hdr.end_entry;) { + i = svsm_build_ca_from_psc_desc(desc, i, pc); - return ret; -} + do { + ret = svsm_perform_call_protocol(&call); + if (!ret) + continue; -void noinstr __sev_es_nmi_complete(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; + /* + * Check if the entry failed because of an RMP mismatch (a + * PVALIDATE at 2M was requested, but the page is mapped in + * the RMP as 4K). + */ - ghcb = __sev_get_ghcb(&state); + if (call.rax_out == SVSM_PVALIDATE_FAIL_SIZEMISMATCH && + pc->entry[pc->cur_index].page_size == RMP_PG_SIZE_2M) { + /* Save this entry for post-processing at 4K */ + pv_4k[pv_4k_count++] = pc->entry[pc->cur_index]; + + /* Skip to the next one unless at the end of the list */ + pc->cur_index++; + if (pc->cur_index < pc->num_entries) + ret = -EAGAIN; + else + ret = 0; + } + } while (ret == -EAGAIN); - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); + if (ret) + svsm_pval_terminate(pc, ret, call.rax_out); + } - sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); - VMGEXIT(); + /* Process any entries that failed to be validated at 2M and validate them at 4K */ + for (i = 0; i < pv_4k_count; i++) { + u64 pfn, pfn_end; - __sev_put_ghcb(&state); -} + action = pv_4k[i].action; + pfn = pv_4k[i].pfn; + pfn_end = pfn + 512; -static u64 __init get_snp_jump_table_addr(void) -{ - struct snp_secrets_page *secrets; - void __iomem *mem; - u64 addr; + while (pfn < pfn_end) { + pfn = svsm_build_ca_from_pfn_range(pfn, pfn_end, action, pc); - mem = ioremap_encrypted(secrets_pa, PAGE_SIZE); - if (!mem) { - pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); - return 0; + ret = svsm_perform_call_protocol(&call); + if (ret) + svsm_pval_terminate(pc, ret, call.rax_out); + } } - secrets = (__force struct snp_secrets_page *)mem; - - addr = secrets->os_area.ap_jump_table_pa; - iounmap(mem); - - return addr; + native_local_irq_restore(flags); } -static u64 __init get_jump_table_addr(void) +static void pvalidate_pages(struct snp_psc_desc *desc) { - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - u64 ret = 0; - - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return get_snp_jump_table_addr(); - - local_irq_save(flags); + if (snp_vmpl) + svsm_pval_pages(desc); + else + pval_pages(desc); +} - ghcb = __sev_get_ghcb(&state); +static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct es_em_ctxt ctxt; vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_2(ghcb, 0); - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (ghcb_sw_exit_info_1_is_valid(ghcb) && - ghcb_sw_exit_info_2_is_valid(ghcb)) - ret = ghcb->save.sw_exit_info_2; - - __sev_put_ghcb(&state); - - local_irq_restore(flags); + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; 
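/*
 * Editor's note: illustrative sketch only, not part of this patch. The
 * VMGEXIT loop below keeps re-issuing the Page State Change request until
 * the hypervisor reports every entry processed, and treats any backwards
 * movement of the descriptor header indices as a (possibly malicious)
 * hypervisor error. A simplified, kernel-independent model of that
 * forward-progress check, with a hypothetical psc_hdr_model standing in
 * for the real snp_psc_desc header fields:
 */
struct psc_hdr_model {
	unsigned int cur_entry;	/* next entry the hypervisor will process */
	unsigned int end_entry;	/* last valid entry in the descriptor */
};

static int psc_progress_is_sane(struct psc_hdr_model before,
				struct psc_hdr_model after)
{
	/* end_entry must never grow and cur_entry must never move backwards */
	return after.end_entry <= before.end_entry &&
	       after.cur_entry >= before.cur_entry;
}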
+ memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - return ret; -} - -static void __head -early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) -{ - unsigned long paddr_end; - u64 val; - - vaddr = vaddr & PAGE_MASK; + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; - paddr = paddr & PAGE_MASK; - paddr_end = paddr + (npages << PAGE_SHIFT); + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - while (paddr < paddr_end) { - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. + * Page State Change VMGEXIT can pass error code through + * exit_info_2. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) - goto e_term; - - if (GHCB_MSR_PSC_RESP_VAL(val)) - goto e_term; + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } - vaddr += PAGE_SIZE; - paddr += PAGE_SIZE; + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. + */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } } - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); -} - -void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned long npages) -{ - /* - * This can be invoked in early boot while running identity mapped, so - * use an open coded check for SNP instead of using cc_platform_has(). - * This eliminates worries about jump tables or checking boot_cpu_data - * in the cc_platform_has() function. - */ - if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) - return; - - /* - * Ask the hypervisor to mark the memory pages as private in the RMP - * table. 
- */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); -} - -void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned long npages) -{ - /* - * This can be invoked in early boot while running identity mapped, so - * use an open coded check for SNP instead of using cc_platform_has(). - * This eliminates worries about jump tables or checking boot_cpu_data - * in the cc_platform_has() function. - */ - if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) - return; - - /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); +out: + return ret; } static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, @@ -1417,90 +1004,6 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd) return 0; } -/* Writes to the SVSM CAA MSR are ignored */ -static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) -{ - if (write) - return ES_OK; - - regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa)); - regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa)); - - return ES_OK; -} - -/* - * TSC related accesses should not exit to the hypervisor when a guest is - * executing with Secure TSC enabled, so special handling is required for - * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ. - */ -static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write) -{ - u64 tsc; - - /* - * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled. - * Terminate the SNP guest when the interception is enabled. - */ - if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ) - return ES_VMM_ERROR; - - /* - * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC - * to return undefined values, so ignore all writes. - * - * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use - * the value returned by rdtsc_ordered(). - */ - if (write) { - WARN_ONCE(1, "TSC MSR writes are verboten!\n"); - return ES_OK; - } - - tsc = rdtsc_ordered(); - regs->ax = lower_32_bits(tsc); - regs->dx = upper_32_bits(tsc); - - return ES_OK; -} - -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - enum es_result ret; - bool write; - - /* Is it a WRMSR? 
*/ - write = ctxt->insn.opcode.bytes[1] == 0x30; - - switch (regs->cx) { - case MSR_SVSM_CAA: - return __vc_handle_msr_caa(regs, write); - case MSR_IA32_TSC: - case MSR_AMD64_GUEST_TSC_FREQ: - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) - return __vc_handle_secure_tsc_msrs(regs, write); - break; - default: - break; - } - - ghcb_set_rcx(ghcb, regs->cx); - if (write) { - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rdx(ghcb, regs->dx); - } - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0); - - if ((ret == ES_OK) && !write) { - regs->ax = ghcb->save.rax; - regs->dx = ghcb->save.rdx; - } - - return ret; -} - static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; @@ -1713,748 +1216,6 @@ void __init sev_es_init_vc_handling(void) initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; } -static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) -{ - int trapnr = ctxt->fi.vector; - - if (trapnr == X86_TRAP_PF) - native_write_cr2(ctxt->fi.cr2); - - ctxt->regs->orig_ax = ctxt->fi.error_code; - do_early_exception(ctxt->regs, trapnr); -} - -static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} -static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned int bytes, bool read) -{ - u64 exit_code, exit_info_1, exit_info_2; - unsigned long ghcb_pa = __pa(ghcb); - enum es_result res; - phys_addr_t paddr; - void __user *ref; - - ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); - if (ref == (void __user *)-1L) - return ES_UNSUPPORTED; - - exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; - - res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); - if (res != ES_OK) { - if (res == ES_EXCEPTION && !read) - ctxt->fi.error_code |= X86_PF_WRITE; - - return res; - } - - exit_info_1 = paddr; - /* Can never be greater than 8 */ - exit_info_2 = bytes; - - ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - - return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); -} - -/* - * The MOVS instruction has two memory operands, which raises the - * problem that it is not known whether the access to the source or the - * destination caused the #VC exception (and hence whether an MMIO read - * or write operation needs to be emulated). - * - * Instead of playing games with walking page-tables and trying to guess - * whether the source or destination is an MMIO range, split the move - * into two operations, a read and a write with only one memory operand. - * This will cause a nested #VC exception on the MMIO address which can - * then be handled. - * - * This implementation has the benefit that it also supports MOVS where - * source _and_ destination are MMIO regions. - * - * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a - * rare operation. If it turns out to be a performance problem the split - * operations can be moved to memcpy_fromio() and memcpy_toio(). 
- */ -static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, - unsigned int bytes) -{ - unsigned long ds_base, es_base; - unsigned char *src, *dst; - unsigned char buffer[8]; - enum es_result ret; - bool rep; - int off; - - ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - if (ds_base == -1L || es_base == -1L) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - src = ds_base + (unsigned char *)ctxt->regs->si; - dst = es_base + (unsigned char *)ctxt->regs->di; - - ret = vc_read_mem(ctxt, src, buffer, bytes); - if (ret != ES_OK) - return ret; - - ret = vc_write_mem(ctxt, dst, buffer, bytes); - if (ret != ES_OK) - return ret; - - if (ctxt->regs->flags & X86_EFLAGS_DF) - off = -bytes; - else - off = bytes; - - ctxt->regs->si += off; - ctxt->regs->di += off; - - rep = insn_has_rep_prefix(&ctxt->insn); - if (rep) - ctxt->regs->cx -= 1; - - if (!rep || ctxt->regs->cx == 0) - return ES_OK; - else - return ES_RETRY; -} - -static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - enum insn_mmio_type mmio; - unsigned int bytes = 0; - enum es_result ret; - u8 sign_byte; - long *reg_data; - - mmio = insn_decode_mmio(insn, &bytes); - if (mmio == INSN_MMIO_DECODE_FAILED) - return ES_DECODE_FAILED; - - if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { - reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); - if (!reg_data) - return ES_DECODE_FAILED; - } - - if (user_mode(ctxt->regs)) - return ES_UNSUPPORTED; - - switch (mmio) { - case INSN_MMIO_WRITE: - memcpy(ghcb->shared_buffer, reg_data, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - case INSN_MMIO_WRITE_IMM: - memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - case INSN_MMIO_READ: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero-extend for 32-bit operation */ - if (bytes == 4) - *reg_data = 0; - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_READ_ZERO_EXTEND: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - memset(reg_data, 0, insn->opnd_bytes); - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_READ_SIGN_EXTEND: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 
0xff : 0x00; - } - - /* Sign extend based on operand size */ - memset(reg_data, sign_byte, insn->opnd_bytes); - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_MOVS: - ret = vc_handle_mmio_movs(ctxt, bytes); - break; - default: - ret = ES_UNSUPPORTED; - break; - } - - return ret; -} - -static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long val, *reg = vc_insn_get_rm(ctxt); - enum es_result ret; - - if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) - return ES_VMM_ERROR; - - if (!reg) - return ES_DECODE_FAILED; - - val = *reg; - - /* Upper 32 bits must be written as zeroes */ - if (val >> 32) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - /* Clear out other reserved bits and set bit 10 */ - val = (val & 0xffff23ffL) | BIT(10); - - /* Early non-zero writes to DR7 are not supported */ - if (!data && (val & ~DR7_RESET_VALUE)) - return ES_UNSUPPORTED; - - /* Using a value of 0 for ExitInfo1 means RAX holds the value */ - ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); - if (ret != ES_OK) - return ret; - - if (data) - data->dr7 = val; - - return ES_OK; -} - -static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long *reg = vc_insn_get_rm(ctxt); - - if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) - return ES_VMM_ERROR; - - if (!reg) - return ES_DECODE_FAILED; - - if (data) - *reg = data->dr7; - else - *reg = DR7_RESET_VALUE; - - return ES_OK; -} - -static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); -} - -static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rcx(ghcb, ctxt->regs->cx); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_monitor(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Treat it as a NOP and do not leak a physical address to the - * hypervisor. - */ - return ES_OK; -} - -static enum es_result vc_handle_mwait(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* Treat the same as MONITOR/MONITORX */ - return ES_OK; -} - -static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rax(ghcb, ctxt->regs->ax); - ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); - - if (x86_platform.hyper.sev_es_hcall_prepare) - x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); - if (ret != ES_OK) - return ret; - - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - - /* - * Call sev_es_hcall_finish() after regs->ax is already set. - * This allows the hypervisor handler to overwrite it again if - * necessary. 
- */ - if (x86_platform.hyper.sev_es_hcall_finish && - !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) - return ES_VMM_ERROR; - - return ES_OK; -} - -static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Calling ecx_alignment_check() directly does not work, because it - * enables IRQs and the GHCB is active. Forward the exception and call - * it later from vc_forward_exception(). - */ - ctxt->fi.vector = X86_TRAP_AC; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; -} - -static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, - struct ghcb *ghcb, - unsigned long exit_code) -{ - enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); - - if (result != ES_OK) - return result; - - switch (exit_code) { - case SVM_EXIT_READ_DR7: - result = vc_handle_dr7_read(ghcb, ctxt); - break; - case SVM_EXIT_WRITE_DR7: - result = vc_handle_dr7_write(ghcb, ctxt); - break; - case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: - result = vc_handle_trap_ac(ghcb, ctxt); - break; - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(ghcb, ctxt, exit_code); - break; - case SVM_EXIT_RDPMC: - result = vc_handle_rdpmc(ghcb, ctxt); - break; - case SVM_EXIT_INVD: - pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); - result = ES_UNSUPPORTED; - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(ghcb, ctxt); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(ghcb, ctxt); - break; - case SVM_EXIT_MSR: - result = vc_handle_msr(ghcb, ctxt); - break; - case SVM_EXIT_VMMCALL: - result = vc_handle_vmmcall(ghcb, ctxt); - break; - case SVM_EXIT_WBINVD: - result = vc_handle_wbinvd(ghcb, ctxt); - break; - case SVM_EXIT_MONITOR: - result = vc_handle_monitor(ghcb, ctxt); - break; - case SVM_EXIT_MWAIT: - result = vc_handle_mwait(ghcb, ctxt); - break; - case SVM_EXIT_NPF: - result = vc_handle_mmio(ghcb, ctxt); - break; - default: - /* - * Unexpected #VC exception - */ - result = ES_UNSUPPORTED; - } - - return result; -} - -static __always_inline bool is_vc2_stack(unsigned long sp) -{ - return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); -} - -static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) -{ - unsigned long sp, prev_sp; - - sp = (unsigned long)regs; - prev_sp = regs->sp; - - /* - * If the code was already executing on the VC2 stack when the #VC - * happened, let it proceed to the normal handling routine. This way the - * code executing on the VC2 stack can cause #VC exceptions to get handled. 
- */ - return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); -} - -static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) -{ - struct ghcb_state state; - struct es_em_ctxt ctxt; - enum es_result result; - struct ghcb *ghcb; - bool ret = true; - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - result = vc_init_em_ctxt(&ctxt, regs, error_code); - - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, ghcb, error_code); - - __sev_put_ghcb(&state); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_VMM_ERROR: - pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_DECODE_FAILED: - pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - pr_emerg("Unknown result in %s():%d\n", __func__, result); - /* - * Emulating the instruction which caused the #VC exception - * failed - can't continue so print debug information - */ - BUG(); - } - - return ret; -} - -static __always_inline bool vc_is_db(unsigned long error_code) -{ - return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; -} - -/* - * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode - * and will panic when an error happens. - */ -DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) -{ - irqentry_state_t irq_state; - - /* - * With the current implementation it is always possible to switch to a - * safe stack because #VC exceptions only happen at known places, like - * intercepted instructions or accesses to MMIO areas/IO ports. They can - * also happen with code instrumentation when the hypervisor intercepts - * #DB, but the critical paths are forbidden to be instrumented, so #DB - * exceptions currently also only happen in safe places. - * - * But keep this here in case the noinstr annotations are violated due - * to bug elsewhere. - */ - if (unlikely(vc_from_invalid_context(regs))) { - instrumentation_begin(); - panic("Can't handle #VC exception from unsupported context\n"); - instrumentation_end(); - } - - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (vc_is_db(error_code)) { - exc_debug(regs); - return; - } - - irq_state = irqentry_nmi_enter(regs); - - instrumentation_begin(); - - if (!vc_raw_handle_exception(regs, error_code)) { - /* Show some debug info */ - show_regs(regs); - - /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - /* If that fails and we get here - just panic */ - panic("Returned from Terminate-Request to Hypervisor\n"); - } - - instrumentation_end(); - irqentry_nmi_exit(regs, irq_state); -} - -/* - * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode - * and will kill the current task with SIGBUS when an error happens. - */ -DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) -{ - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. 
- */ - if (vc_is_db(error_code)) { - noist_exc_debug(regs); - return; - } - - irqentry_enter_from_user_mode(regs); - instrumentation_begin(); - - if (!vc_raw_handle_exception(regs, error_code)) { - /* - * Do not kill the machine if user-space triggered the - * exception. Send SIGBUS instead and let user-space deal with - * it. - */ - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); - } - - instrumentation_end(); - irqentry_exit_to_user_mode(regs); -} - -bool __init handle_vc_boot_ghcb(struct pt_regs *regs) -{ - unsigned long exit_code = regs->orig_ax; - struct es_em_ctxt ctxt; - enum es_result result; - - vc_ghcb_invalidate(boot_ghcb); - - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_VMM_ERROR: - early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_DECODE_FAILED: - early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_EXCEPTION: - vc_early_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - BUG(); - } - - return true; - -fail: - show_regs(regs); - - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} - -/* - * Initial set up of SNP relies on information provided by the - * Confidential Computing blob, which can be passed to the kernel - * in the following ways, depending on how it is booted: - * - * - when booted via the boot/decompress kernel: - * - via boot_params - * - * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): - * - via a setup_data entry, as defined by the Linux Boot Protocol - * - * Scan for the blob in that order. - */ -static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - /* Boot kernel would have passed the CC blob via boot_params. */ - if (bp->cc_blob_address) { - cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; - goto found_cc_info; - } - - /* - * If kernel was booted directly, without the use of the - * boot/decompression kernel, the CC blob may have been passed via - * setup_data instead. - */ - cc_info = find_cc_blob_setup_data(bp); - if (!cc_info) - return NULL; - -found_cc_info: - if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); - - return cc_info; -} - -static __head void svsm_setup(struct cc_blob_sev_info *cc_info) -{ - struct svsm_call call = {}; - int ret; - u64 pa; - - /* - * Record the SVSM Calling Area address (CAA) if the guest is not - * running at VMPL0. The CA will be used to communicate with the - * SVSM to perform the SVSM services. - */ - if (!svsm_setup_ca(cc_info)) - return; - - /* - * It is very early in the boot and the kernel is running identity - * mapped but without having adjusted the pagetables to where the - * kernel was loaded (physbase), so the get the CA address using - * RIP-relative addressing. - */ - pa = (u64)&RIP_REL_REF(boot_svsm_ca_page); - - /* - * Switch over to the boot SVSM CA while the current CA is still - * addressable. There is no GHCB at this point so use the MSR protocol. 
- * - * SVSM_CORE_REMAP_CA call: - * RAX = 0 (Protocol=0, CallID=0) - * RCX = New CA GPA - */ - call.caa = svsm_get_caa(); - call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); - call.rcx = pa; - ret = svsm_perform_call_protocol(&call); - if (ret) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); - - RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa; - RIP_REL_REF(boot_svsm_caa_pa) = pa; -} - -bool __head snp_init(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - if (!bp) - return false; - - cc_info = find_cc_blob(bp); - if (!cc_info) - return false; - - if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE) - secrets_pa = cc_info->secrets_phys; - else - return false; - - setup_cpuid_table(cc_info); - - svsm_setup(cc_info); - - /* - * The CC blob will be used later to access the secrets page. Cache - * it here like the boot kernel does. - */ - bp->cc_blob_address = (u32)(unsigned long)cc_info; - - return true; -} - -void __head __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} - /* * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are * enabled, as the alternative (fallback) logic for DMI probing in the legacy @@ -2835,7 +1596,7 @@ struct snp_msg_desc *snp_msg_alloc(void) if (!mdesc) return ERR_PTR(-ENOMEM); - mem = ioremap_encrypted(secrets_pa, PAGE_SIZE); + mem = ioremap_encrypted(sev_secrets_pa, PAGE_SIZE); if (!mem) goto e_free_mdesc; diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/sev-nmi.c new file mode 100644 index 000000000000..d8dfaddfb367 --- /dev/null +++ b/arch/x86/coco/sev/sev-nmi.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include <linux/bug.h> +#include <linux/kernel.h> + +#include <asm/cpu_entry_area.h> +#include <asm/msr.h> +#include <asm/ptrace.h> +#include <asm/sev.h> +#include <asm/sev-internal.h> + +static __always_inline bool on_vc_stack(struct pt_regs *regs) +{ + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can be also be + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. 
+ */ + if (on_vc_stack(regs)) + new_ist = regs->sp; + + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + __sev_put_ghcb(&state); +} diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c new file mode 100644 index 000000000000..b4895c648024 --- /dev/null +++ b/arch/x86/coco/sev/vc-handle.c @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include <linux/sched/debug.h> /* For show_regs() */ +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/psp-sev.h> +#include <uapi/linux/sev-guest.h> + +#include <asm/init.h> +#include <asm/stacktrace.h> +#include <asm/sev.h> +#include <asm/sev-internal.h> +#include <asm/insn-eval.h> +#include <asm/fpu/xcr.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/traps.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> + +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = (unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return ES_EXCEPTION; + } + + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return ES_OK; +} + +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + BUG_ON(size > 4); + + if (user_mode(ctxt->regs)) { + struct thread_struct *t = ¤t->thread; + struct io_bitmap *iobm = t->io_bitmap; + size_t idx; + + if (!iobm) + goto fault; + + for (idx = port; idx < port + size; ++idx) { + if (test_bit(idx, iobm->bitmap)) + goto fault; + } + } + + return ES_OK; + +fault: + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + + return ES_EXCEPTION; +} + +void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + 
ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + break; + case X86_TRAP_PF: + write_cr2(ctxt->fi.cr2); + exc_page_fault(ctxt->regs, error_code); + break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int insn_bytes; + + insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (insn_bytes == 0) { + /* Nothing could be copied */ + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } else if (insn_bytes == -EINVAL) { + /* Effective RIP could not be calculated */ + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + ctxt->fi.cr2 = 0; + return ES_EXCEPTION; + } + + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) + return ES_DECODE_FAILED; + + if (ctxt->insn.immediate.got) + return ES_OK; + else + return ES_DECODE_FAILED; +} + +static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res, ret; + + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + else + return ES_OK; +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + if (user_mode(ctxt->regs)) + return __vc_decode_user_insn(ctxt); + else + return __vc_decode_kern_insn(ctxt); +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + + /* + * This function uses __put_user() independent of whether kernel or user + * memory is accessed. This works fine because __put_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __put_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_to_user() here because + * vc_write_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. 
+ */ + switch (size) { + case 1: { + u8 d1; + u8 __user *target = (u8 __user *)dst; + + memcpy(&d1, buf, 1); + if (__put_user(d1, target)) + goto fault; + break; + } + case 2: { + u16 d2; + u16 __user *target = (u16 __user *)dst; + + memcpy(&d2, buf, 2); + if (__put_user(d2, target)) + goto fault; + break; + } + case 4: { + u32 d4; + u32 __user *target = (u32 __user *)dst; + + memcpy(&d4, buf, 4); + if (__put_user(d4, target)) + goto fault; + break; + } + case 8: { + u64 d8; + u64 __user *target = (u64 __user *)dst; + + memcpy(&d8, buf, 8); + if (__put_user(d8, target)) + goto fault; + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + + /* + * This function uses __get_user() independent of whether kernel or user + * memory is accessed. This works fine because __get_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __get_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_from_user() here because + * vc_read_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *s = (u8 __user *)src; + + if (__get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + } + case 2: { + u16 d2; + u16 __user *s = (u16 __user *)src; + + if (__get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + } + case 4: { + u32 d4; + u32 __user *s = (u32 __user *)src; + + if (__get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + } + case 8: { + u64 d8; + u64 __user *s = (u64 __user *)src; + if (__get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) + +#include "vc-shared.c" + +/* Writes to the SVSM CAA MSR are ignored */ +static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) +{ + if (write) + return ES_OK; + + regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa)); + regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa)); + + return ES_OK; +} + +/* + * TSC related accesses should not exit to the hypervisor when a guest is + * executing with Secure TSC enabled, so special handling is required for + * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ. 
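A note on the Secure TSC handling described above: an emulated MSR read returns its 64-bit result split across EDX:EAX, which is why the handler below fills regs->ax and regs->dx from rdtsc_ordered(). A minimal stand-alone sketch of that split; struct fake_regs and emulate_tsc_read() are made-up names, not part of the patch.

#include <stdint.h>
#include <stdio.h>

struct fake_regs { uint64_t ax, dx; };        /* stand-in for pt_regs */

/* Return a 64-bit counter through EDX:EAX, mirroring what the emulated
 * RDMSR of MSR_IA32_TSC does under Secure TSC. */
static void emulate_tsc_read(struct fake_regs *regs, uint64_t tsc)
{
        regs->ax = (uint32_t)tsc;             /* lower_32_bits(tsc) */
        regs->dx = (uint32_t)(tsc >> 32);     /* upper_32_bits(tsc) */
}

int main(void)
{
        struct fake_regs regs;

        emulate_tsc_read(&regs, 0x1122334455667788ULL);
        printf("eax=%#lx edx=%#lx\n",
               (unsigned long)regs.ax, (unsigned long)regs.dx);
        return 0;
}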
+ */ +static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write) +{ + u64 tsc; + + /* + * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled. + * Terminate the SNP guest when the interception is enabled. + */ + if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ) + return ES_VMM_ERROR; + + /* + * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC + * to return undefined values, so ignore all writes. + * + * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use + * the value returned by rdtsc_ordered(). + */ + if (write) { + WARN_ONCE(1, "TSC MSR writes are verboten!\n"); + return ES_OK; + } + + tsc = rdtsc_ordered(); + regs->ax = lower_32_bits(tsc); + regs->dx = upper_32_bits(tsc); + + return ES_OK; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + bool write; + + /* Is it a WRMSR? */ + write = ctxt->insn.opcode.bytes[1] == 0x30; + + switch (regs->cx) { + case MSR_SVSM_CAA: + return __vc_handle_msr_caa(regs, write); + case MSR_IA32_TSC: + case MSR_AMD64_GUEST_TSC_FREQ: + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + return __vc_handle_secure_tsc_msrs(regs, write); + break; + default: + break; + } + + ghcb_set_rcx(ghcb, regs->cx); + if (write) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0); + + if ((ret == ES_OK) && !write) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return res; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. 
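The MOVS strategy explained in this comment is implemented by vc_handle_mmio_movs() further down; as a user-space model, one emulated iteration is just a read into a bounce buffer, a write, and a signed SI/DI step that follows the direction flag. The sketch below is illustrative only; movs_step() and its layout are assumptions, not kernel code.

#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <stdio.h>

/* One emulated MOVS iteration: read into a bounce buffer, write it out,
 * then advance SI/DI by +/-bytes depending on DF. */
static void movs_step(uint8_t *mem, uint64_t *si, uint64_t *di,
                      unsigned int bytes, bool df)
{
        uint8_t buf[8];
        int64_t off = df ? -(int64_t)bytes : (int64_t)bytes;

        memcpy(buf, mem + *si, bytes);        /* the "read" half  */
        memcpy(mem + *di, buf, bytes);        /* the "write" half */

        *si += off;
        *di += off;
}

int main(void)
{
        uint8_t mem[32] = "hello";
        uint64_t si = 0, di = 16;

        for (int i = 0; i < 6; i++)           /* like "rep movsb" with cx == 6 */
                movs_step(mem, &si, &di, 1, false);

        puts((char *)&mem[16]);               /* prints "hello" */
        return 0;
}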
+ * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). + */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + enum insn_mmio_type mmio; + unsigned int bytes = 0; + enum es_result ret; + u8 sign_byte; + long *reg_data; + + mmio = insn_decode_mmio(insn, &bytes); + if (mmio == INSN_MMIO_DECODE_FAILED) + return ES_DECODE_FAILED; + + if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { + reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); + if (!reg_data) + return ES_DECODE_FAILED; + } + + if (user_mode(ctxt->regs)) + return ES_UNSUPPORTED; + + switch (mmio) { + case INSN_MMIO_WRITE: + memcpy(ghcb->shared_buffer, reg_data, bytes); + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + case INSN_MMIO_WRITE_IMM: + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + case INSN_MMIO_READ: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_READ_ZERO_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + memset(reg_data, 0, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_READ_SIGN_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 
0xff : 0x00; + } + + /* Sign extend based on operand size */ + memset(reg_data, sign_byte, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_MOVS: + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + default: + ret = ES_UNSUPPORTED; + break; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + + if (!reg) + return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. 
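The zero- and sign-extension for MMIO reads above boils down to a memset() of the destination width followed by a memcpy() of the raw value. A stand-alone sketch, assuming a little-endian host as on x86; widen_mmio_read() is a made-up helper name.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Widen a 'bytes'-wide MMIO result to 'opnd_bytes', optionally sign
 * extending, using the same memset()+memcpy() pattern as the handler. */
static uint64_t widen_mmio_read(const void *shared, unsigned int bytes,
                                unsigned int opnd_bytes, int sign_extend)
{
        const uint8_t *p = shared;
        uint8_t sign_byte = 0;
        uint64_t reg = 0;

        if (sign_extend && (p[bytes - 1] & 0x80))
                sign_byte = 0xff;

        memset(&reg, sign_byte, opnd_bytes);  /* fill the destination width */
        memcpy(&reg, shared, bytes);          /* then drop in the raw value */
        return reg;
}

int main(void)
{
        uint8_t val = 0x80;

        /* movsx of a byte into a 32-bit register: 0x80 -> 0xffffff80 */
        printf("%#lx\n", (unsigned long)widen_mmio_read(&val, 1, 4, 1));
        return 0;
}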
+ */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). + */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); + + if (result != ES_OK) + return result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline bool is_vc2_stack(unsigned long sp) +{ + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) +{ + unsigned long sp, prev_sp; + + sp = (unsigned long)regs; + prev_sp = regs->sp; + + /* + * If the code was already executing on the VC2 stack when the #VC + * happened, let it proceed to the normal handling routine. This way the + * code executing on the VC2 stack can cause #VC exceptions to get handled. 
+ */ + return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); +} + +static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + bool ret = true; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + __sev_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + + return ret; +} + +static __always_inline bool vc_is_db(unsigned long error_code) +{ + return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; +} + +/* + * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode + * and will panic when an error happens. + */ +DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) +{ + irqentry_state_t irq_state; + + /* + * With the current implementation it is always possible to switch to a + * safe stack because #VC exceptions only happen at known places, like + * intercepted instructions or accesses to MMIO areas/IO ports. They can + * also happen with code instrumentation when the hypervisor intercepts + * #DB, but the critical paths are forbidden to be instrumented, so #DB + * exceptions currently also only happen in safe places. + * + * But keep this here in case the noinstr annotations are violated due + * to bug elsewhere. + */ + if (unlikely(vc_from_invalid_context(regs))) { + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); + } + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + exc_debug(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +/* + * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode + * and will kill the current task with SIGBUS when an error happens. + */ +DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) +{ + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. 
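For orientation, every emulation attempt in the handlers above collapses into one of a few es_result values. The sketch below redeclares the enum purely for illustration (the authoritative definition lives in the kernel's SEV headers) and summarizes how vc_raw_handle_exception() and the IDT entry points react to each value.

#include <stdio.h>

/* Redeclared only for this sketch; see the SEV headers for the real enum. */
enum es_result {
        ES_OK, ES_UNSUPPORTED, ES_VMM_ERROR,
        ES_DECODE_FAILED, ES_EXCEPTION, ES_RETRY,
};

static const char *vc_outcome(enum es_result r)
{
        switch (r) {
        case ES_OK:            return "advance RIP past the emulated instruction";
        case ES_RETRY:         return "leave RIP alone so the instruction re-executes";
        case ES_EXCEPTION:     return "forward #GP/#UD/#PF/#AC to the regular handler";
        case ES_UNSUPPORTED:
        case ES_DECODE_FAILED:
        case ES_VMM_ERROR:     return "terminate (kernel mode) or SIGBUS (user mode)";
        }
        return "unknown";
}

int main(void)
{
        puts(vc_outcome(ES_RETRY));
        return 0;
}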
+ */ + if (vc_is_db(error_code)) { + noist_exc_debug(regs); + return; + } + + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } + + instrumentation_end(); + irqentry_exit_to_user_mode(regs); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c new file mode 100644 index 000000000000..2c0ab0fdc060 --- /dev/null +++ b/arch/x86/coco/sev/vc-shared.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0 + +static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; + u8 modrm = ctxt->insn.modrm.value; + + switch (exit_code) { + + case SVM_EXIT_IOIO: + case SVM_EXIT_NPF: + /* handled separately */ + return ES_OK; + + case SVM_EXIT_CPUID: + if (opcode == 0xa20f) + return ES_OK; + break; + + case SVM_EXIT_INVD: + if (opcode == 0x080f) + return ES_OK; + break; + + case SVM_EXIT_MONITOR: + /* MONITOR and MONITORX instructions generate the same error code */ + if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa)) + return ES_OK; + break; + + case SVM_EXIT_MWAIT: + /* MWAIT and MWAITX instructions generate the same error code */ + if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb)) + return ES_OK; + break; + + case SVM_EXIT_MSR: + /* RDMSR */ + if (opcode == 0x320f || + /* WRMSR */ + opcode == 0x300f) + return ES_OK; + break; + + case SVM_EXIT_RDPMC: + if (opcode == 0x330f) + return ES_OK; + break; + + case SVM_EXIT_RDTSC: + if (opcode == 0x310f) + return ES_OK; + break; + + case SVM_EXIT_RDTSCP: + if (opcode == 0x010f && modrm == 0xf9) + return ES_OK; + break; + + case SVM_EXIT_READ_DR7: + if (opcode == 0x210f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_VMMCALL: + if (opcode == 0x010f && modrm == 0xd9) + return ES_OK; + + break; + + case SVM_EXIT_WRITE_DR7: + if (opcode == 0x230f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_WBINVD: + if (opcode == 0x90f) + return ES_OK; + break; + + default: + break; + } + + sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", + 
opcode, exit_code, ctxt->regs->ip); + + return ES_UNSUPPORTED; +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require to decode the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) +{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, + unsigned long address, + bool write) +{ + if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_USER; + ctxt->fi.cr2 = address; + if (write) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + return ES_OK; +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + unsigned long address = (unsigned long)src; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, false); + if (ret != ES_OK) + return ret; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? 
-1 : 1; + unsigned long address = (unsigned long)dst; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, true); + if (ret != ES_OK) + return ret; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + size_t size; + u64 port; + + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + port = ctxt->regs->dx & 0xffff; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + port = ctxt->regs->dx & 0xffff; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + port = (u8)insn->immediate.value & 0xffff; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + port = (u8)insn->immediate.value & 0xffff; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + port = ctxt->regs->dx & 0xffff; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + port = ctxt->regs->dx & 0xffff; + break; + + default: + return ES_DECODE_FAILED; + } + + *exitinfo |= port << 16; + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + size = 1; + break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + size = (insn->opnd_bytes == 2) ? 2 : 4; + } + + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return vc_ioio_check(ctxt, (u16)port, size); +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? 
regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. + */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(ghcb, ctxt, &leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + /* + * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure + * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP + * instructions are being intercepted. 
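The string-I/O chunking above bounds each VMGEXIT by how many elements fit into the GHCB shared buffer. A small stand-alone illustration of that arithmetic; the 2032-byte buffer size is an assumption taken from the current GHCB layout, and ioio_elems_per_exit() is a made-up name.

#include <stdio.h>

#define SHARED_BUF_SIZE 2032u   /* sizeof(ghcb->shared_buffer), assumed */

/* Elements moved per exit for a REP INS/OUTS: limited by the rep count
 * and by how many elements fit into the shared buffer. */
static unsigned long ioio_elems_per_exit(unsigned long rep_count,
                                         unsigned int io_bytes)
{
        unsigned long ghcb_count = SHARED_BUF_SIZE / io_bytes;

        return rep_count < ghcb_count ? rep_count : ghcb_count;
}

int main(void)
{
        /* "rep outsl" with ecx == 4096: 508 dwords per #VC, so 9 exits */
        printf("%lu\n", ioio_elems_per_exit(4096, 4));
        return 0;
}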
If this should occur and Secure + * TSC is enabled, guest execution should be terminated as the guest + * cannot rely on the TSC value provided by the hypervisor. + */ + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + return ES_VMM_ERROR; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 139ad80d1df3..5fe201efc753 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2803,8 +2803,15 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return 0; + /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = READ_ONCE(current->active_mm->context.ldt); + ldt = smp_load_acquire(¤t->mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4a37a8bd87fd..e18cdaa1573c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -82,6 +82,12 @@ struct alt_instr { extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; +extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: @@ -335,11 +341,6 @@ void nop_func(void); __ALTERNATIVE(\oldinstr, \newinstr, \ft_flags) .endm -#define old_len 141b-140b -#define new_len1 144f-143f -#define new_len2 145f-144f -#define new_len3 146f-145f - /* * Same as ALTERNATIVE macro above but for two alternatives. If CPU * has @feature1, it replaces @oldinstr with @newinstr1. 
If CPU has diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index cbc6157f0b4b..b5982b94bdba 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -16,8 +16,7 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res; - asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE - "call __sw_hweight32", + asm_inline (ALTERNATIVE("call __sw_hweight32", "popcntl %[val], %[cnt]", X86_FEATURE_POPCNT) : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT : [val] REG_IN (w)); @@ -46,8 +45,7 @@ static __always_inline unsigned long __arch_hweight64(__u64 w) { unsigned long res; - asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE - "call __sw_hweight64", + asm_inline (ALTERNATIVE("call __sw_hweight64", "popcntq %[val], %[cnt]", X86_FEATURE_POPCNT) : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT : [val] REG_IN (w)); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index cc2881576c2c..eef0771512de 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -114,17 +114,12 @@ #endif #ifndef __ASSEMBLER__ -#ifndef __pic__ static __always_inline __pure void *rip_rel_ptr(void *p) { asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p)); return p; } -#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var))) -#else -#define RIP_REL_REF(var) (var) -#endif #endif /* diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 100413aff640..eebbc8889e70 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -248,7 +248,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) static __always_inline unsigned long variable__ffs(unsigned long word) { - asm("rep; bsf %1,%0" + asm("tzcnt %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; @@ -267,10 +267,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) static __always_inline unsigned long variable_ffz(unsigned long word) { - asm("rep; bsf %1,%0" - : "=r" (word) - : "r" (~word)); - return word; + return variable__ffs(~word); } /** diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 3f02ff6d333d..02b23aa78955 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -74,6 +74,11 @@ # define BOOT_STACK_SIZE 0x1000 #endif +#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) + +#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE +#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 + #ifndef __ASSEMBLER__ extern unsigned int output_len; extern const unsigned long kernel_text_size; @@ -83,6 +88,11 @@ unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, void (*error)(char *x)); extern struct boot_params *boot_params_ptr; +extern unsigned long *trampoline_32bit; +extern const u16 trampoline_ljmp_imm_offset; + +void trampoline_32bit_src(void *trampoline, bool enable_5lvl); + #endif #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index e7225452963f..e1dbf8df1b69 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -22,7 +22,7 @@ static inline u64 cc_get_mask(void) static inline void cc_set_mask(u64 mask) { - RIP_REL_REF(cc_mask) = mask; + cc_mask = mask; } u64 cc_mkenc(u64 val); diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index fdbbbfec745a..719d95f1ab5e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -23,7 +23,7 @@ DECLARE_PER_CPU(unsigned long, 
cpu_dr7); static __always_inline unsigned long native_get_debugreg(int regno) { - unsigned long val = 0; /* Damn you, gcc! */ + unsigned long val; switch (regno) { case 0: @@ -43,7 +43,7 @@ static __always_inline unsigned long native_get_debugreg(int regno) break; case 7: /* - * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them + * Use "asm volatile" for DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception @@ -55,7 +55,7 @@ static __always_inline unsigned long native_get_debugreg(int regno) * re-ordered to happen before the call to sev_es_ist_enter(), * causing stack recursion. */ - asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%db7, %0" : "=r" (val)); break; default: BUG(); @@ -83,15 +83,15 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) break; case 7: /* - * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them + * Use "asm volatile" for DR7 writes to forbid re-ordering them * with other code. * * While is didn't happen with a DR7 write (see the DR7 read * comment above which explains where it happened), add the - * __FORCE_ORDER here too to avoid similar problems in the + * "asm volatile" here too to avoid similar problems in the * future. */ - asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); + asm volatile("mov %0, %%db7" ::"r" (value)); break; default: BUG(); diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 53e4015242b4..97f341777db5 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -82,6 +82,7 @@ #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) #define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9)) #define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10)) +#define INAT_INV64 (1 << (INAT_FLAG_OFFS + 11)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -242,4 +243,9 @@ static inline int inat_evex_scalable(insn_attr_t attr) { return attr & INAT_EVEX_SCALABLE; } + +static inline int inat_is_invalid64(insn_attr_t attr) +{ + return attr & INAT_INV64; +} #endif diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e889c3bab5a2..ca309a3227c7 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -217,7 +217,7 @@ void memset_io(volatile void __iomem *, int, size_t); static inline void __iowrite32_copy(void __iomem *to, const void *from, size_t count) { - asm volatile("rep ; movsl" + asm volatile("rep movsl" : "=&c"(count), "=&D"(to), "=&S"(from) : "0"(count), "1"(to), "2"(from) : "memory"); @@ -282,7 +282,7 @@ static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \ count--; \ } \ } else { \ - asm volatile("rep; outs" #bwl \ + asm volatile("rep outs" #bwl \ : "+S"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ @@ -298,7 +298,7 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ count--; \ } \ } else { \ - asm volatile("rep; ins" #bwl \ + asm volatile("rep ins" #bwl \ : "+D"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5432457d2338..f2ad77929d6e 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -8,6 +8,9 @@ # define PA_PGD 2 # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +#else +/* Size of each exception handler referenced by the IDT */ +# define 
KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */ #endif # define KEXEC_CONTROL_PAGE_SIZE 4096 @@ -59,6 +62,10 @@ struct kimage; extern unsigned long kexec_va_control_page; extern unsigned long kexec_pa_table_page; extern unsigned long kexec_pa_swap_page; +extern gate_desc kexec_debug_idt[]; +extern unsigned char kexec_debug_exc_vectors[]; +extern uint16_t kexec_debug_8250_port; +extern unsigned long kexec_debug_8250_mmio32; #endif /* diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index b51d8a4673f5..9d38ae744a2e 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -141,5 +141,15 @@ #define SYM_FUNC_START_WEAK_NOALIGN(name) \ SYM_START(name, SYM_L_WEAK, SYM_A_NONE) +/* + * Expose 'sym' to the startup code in arch/x86/boot/startup/, by emitting an + * alias prefixed with __pi_ + */ +#ifdef __ASSEMBLER__ +#define SYM_PIC_ALIAS(sym) SYM_ALIAS(__pi_ ## sym, sym, SYM_L_GLOBAL) +#else +#define SYM_PIC_ALIAS(sym) extern typeof(sym) __PASTE(__pi_, sym) __alias(sym) +#endif + #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 1530ee301dfe..ea6494628cb0 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -61,7 +61,7 @@ void __init sev_es_init_vc_handling(void); static inline u64 sme_get_me_mask(void) { - return RIP_REL_REF(sme_me_mask); + return sme_me_mask; } #define __bss_decrypted __section(".bss..decrypted") diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 695e569159c1..be7cddc414e4 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -17,10 +17,12 @@ struct ucode_cpu_info { void load_ucode_bsp(void); void load_ucode_ap(void); void microcode_bsp_resume(void); +bool __init microcode_loader_disabled(void); #else static inline void load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void microcode_bsp_resume(void) { } +static inline bool __init microcode_loader_disabled(void) { return false; } #endif extern unsigned long initrd_start_early; diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 8b8055a8eb9e..0fe9c569d171 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -16,6 +16,8 @@ #define MM_CONTEXT_LOCK_LAM 2 /* Allow LAM and SVA coexisting */ #define MM_CONTEXT_FORCE_TAGGED_SVA 3 +/* Tracks mm_cpumask */ +#define MM_CONTEXT_NOTRACK 4 /* * x86 has arch-specific MMU state beyond what lives in mm_struct. 
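On the C side, the new SYM_PIC_ALIAS() shown above is just the compiler's alias attribute giving a symbol a second, __pi_-prefixed name for the startup code. A tiny user-space demonstration; my_var is a made-up symbol.

#include <stdio.h>

unsigned long my_var = 42;

/* Roughly what SYM_PIC_ALIAS(my_var) expands to in C: a second symbol
 * name, __pi_my_var, bound to the same object. */
extern typeof(my_var) __pi_my_var __attribute__((alias("my_var")));

int main(void)
{
        printf("%lu %lu\n", my_var, __pi_my_var);
        printf("%p %p\n", (void *)&my_var, (void *)&__pi_my_var);
        return 0;
}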
@@ -44,9 +46,7 @@ typedef struct { struct ldt_struct *ldt; #endif -#ifdef CONFIG_X86_64 unsigned long flags; -#endif #ifdef CONFIG_ADDRESS_MASKING /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 2398058b6e83..73bf3b1b44e8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -190,7 +190,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, #define activate_mm(prev, next) \ do { \ paravirt_enter_mmap(next); \ - switch_mm((prev), (next), NULL); \ + switch_mm_irqs_off((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 @@ -247,6 +247,16 @@ static inline bool is_64bit_mm(struct mm_struct *mm) } #endif +static inline bool is_notrack_mm(struct mm_struct *mm) +{ + return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + +static inline void set_notrack_mm(struct mm_struct *mm) +{ + set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other @@ -272,4 +282,7 @@ unsigned long __get_current_cr3_fast(void); #include <asm-generic/mmu_context.h> +extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm); +extern void unuse_temporary_mm(struct mm_struct *prev_mm); + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index fd4effd7654a..51a677fe9a8d 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); +DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear); extern u16 mds_verw_sel; diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 9f77bf03d747..018a8d906ca3 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -29,9 +29,7 @@ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC /* Physical address where kernel should be loaded. */ -#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ - + (CONFIG_PHYSICAL_ALIGN - 1)) \ - & ~(CONFIG_PHYSICAL_ALIGN - 1)) +#define LOAD_PHYSICAL_ADDR __ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1) #define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 5fe314a2e73e..b0d03b6c279b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -29,6 +29,8 @@ #ifdef CONFIG_SMP +#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" + #ifdef CONFIG_CC_HAS_NAMED_AS #ifdef __CHECKER__ @@ -36,23 +38,23 @@ # define __seg_fs __attribute__((address_space(__seg_fs))) #endif +#define __percpu_prefix #define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg) -#define __percpu_prefix "" #else /* !CONFIG_CC_HAS_NAMED_AS: */ +#define __percpu_prefix __force_percpu_prefix #define __percpu_seg_override -#define __percpu_prefix "%%"__stringify(__percpu_seg)":" #endif /* CONFIG_CC_HAS_NAMED_AS */ -#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" -#define __my_cpu_offset this_cpu_read(this_cpu_off) - /* * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. 
- * + */ +#define __my_cpu_offset this_cpu_read(this_cpu_off) + +/* * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit * kernel, because games are played with CONFIG_X86_64 there and * sizeof(this_cpu_off) becames 4. @@ -77,9 +79,9 @@ #else /* !CONFIG_SMP: */ +#define __force_percpu_prefix +#define __percpu_prefix #define __percpu_seg_override -#define __percpu_prefix "" -#define __force_percpu_prefix "" #define PER_CPU_VAR(var) (var)__percpu_rel @@ -97,8 +99,8 @@ # define __my_cpu_var(var) (*__my_cpu_ptr(&(var))) #endif -#define __percpu_arg(x) __percpu_prefix "%" #x #define __force_percpu_arg(x) __force_percpu_prefix "%" #x +#define __percpu_arg(x) __percpu_prefix "%" #x /* * For arch-specific code, we can use direct single-insn ops (they diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5d2f7e5aff26..0973bed22172 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -734,6 +734,7 @@ void store_cpu_caps(struct cpuinfo_x86 *info); enum l1tf_mitigations { L1TF_MITIGATION_OFF, + L1TF_MITIGATION_AUTO, L1TF_MITIGATION_FLUSH_NOWARN, L1TF_MITIGATION_FLUSH, L1TF_MITIGATION_FLUSH_NOSMT, diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ad9212df0ec0..6324f4c6c545 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -52,6 +52,7 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); +extern void startup_64_load_idt(void *vc_handler); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h new file mode 100644 index 000000000000..b7232081f8f7 --- /dev/null +++ b/arch/x86/include/asm/sev-internal.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define DR7_RESET_VALUE 0x400 + +extern struct ghcb boot_ghcb_page; +extern u64 sev_hv_features; +extern u64 sev_secrets_pa; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. + */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. 
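The backup-GHCB scheme described in the comments above (an NMI interrupting a #VC handler and raising another #VC) can be modelled with one backup slot and two flags. A deliberately simplified sketch supporting a single nesting level; get_ghcb(), put_ghcb() and struct ghcb_model are stand-ins, not the real __sev_get_ghcb()/__sev_put_ghcb().

#include <string.h>
#include <stdbool.h>

struct ghcb_model { unsigned char page[4096]; };   /* stand-in for struct ghcb */

static struct ghcb_model ghcb_page, backup_ghcb;
static bool ghcb_active, backup_ghcb_active;

/* A second (nested) user saves the live page; the matching put restores it. */
static struct ghcb_model *get_ghcb(void)
{
        if (ghcb_active) {
                memcpy(&backup_ghcb, &ghcb_page, sizeof(ghcb_page));
                backup_ghcb_active = true;
        }
        ghcb_active = true;
        return &ghcb_page;
}

static void put_ghcb(void)
{
        if (backup_ghcb_active) {
                memcpy(&ghcb_page, &backup_ghcb, sizeof(ghcb_page));
                backup_ghcb_active = false;
        } else {
                ghcb_active = false;
        }
}

int main(void)
{
        get_ghcb();     /* #VC handler                    */
        get_ghcb();     /* nested #VC raised from the NMI */
        put_ghcb();     /* restores the outer contents    */
        put_ghcb();
        return 0;
}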
+ */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +extern struct svsm_ca boot_svsm_ca_page; + +struct ghcb *__sev_get_ghcb(struct ghcb_state *state); +void __sev_put_ghcb(struct ghcb_state *state); + +DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + +void early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, enum psc_op op); + +DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); +DECLARE_PER_CPU(u64, svsm_caa_pa); + +extern struct svsm_ca *boot_svsm_caa; +extern u64 boot_svsm_caa_pa; + +static __always_inline struct svsm_ca *svsm_get_caa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa); + else + return boot_svsm_caa; +} + +static __always_inline u64 svsm_get_caa_pa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa_pa); + else + return boot_svsm_caa_pa; +} + +int svsm_perform_call_protocol(struct svsm_call *call); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static __always_inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +void snp_register_ghcb_early(unsigned long paddr); +bool sev_es_negotiate_protocol(void); +bool sev_es_check_cpu_features(void); +u64 get_hv_features(void); + +const struct snp_cpuid_table *snp_cpuid_get_table(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ba7999f66abe..6158893786d6 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -15,6 +15,7 @@ #include <asm/sev-common.h> #include <asm/coco.h> #include <asm/set_memory.h> +#include <asm/svm.h> #define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 2ULL @@ -83,6 +84,36 @@ extern void vc_no_ghcb(void); extern void vc_boot_ghcb(void); extern bool handle_vc_boot_ghcb(struct pt_regs *regs); +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. 
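The SNP CPUID table declared here is consulted with a simple linear lookup keyed on leaf and sub-leaf. A trimmed sketch with the fields reduced to what the lookup needs; it ignores the xcr0/xss matching and the sub-leaf-significance handling that the real snp_cpuid() path performs, and all names below are stand-ins.

#include <stdint.h>
#include <stddef.h>

/* Reduced view of the firmware-provided table; the full layout is the
 * snp_cpuid_fn/snp_cpuid_table definition in the header above. */
struct cpuid_fn_model {
        uint32_t eax_in, ecx_in;
        uint32_t eax, ebx, ecx, edx;
};

struct cpuid_table_model {
        uint32_t count;
        struct cpuid_fn_model fn[64];   /* SNP_CPUID_COUNT_MAX */
};

static const struct cpuid_fn_model *
cpuid_table_find(const struct cpuid_table_model *tbl, uint32_t fn, uint32_t subfn)
{
        for (uint32_t i = 0; i < tbl->count && i < 64; i++) {
                if (tbl->fn[i].eax_in == fn && tbl->fn[i].ecx_in == subfn)
                        return &tbl->fn[i];
        }
        return NULL;
}

int main(void)
{
        struct cpuid_table_model tbl = { .count = 1, .fn = { { .eax_in = 1 } } };

        return cpuid_table_find(&tbl, 1, 0) ? 0 : 1;
}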
+ */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + /* PVALIDATE return codes */ #define PVALIDATE_FAIL_SIZEMISMATCH 6 @@ -484,6 +515,34 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + ghcb->save.sw_exit_code = 0; + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +void vc_forward_exception(struct es_em_ctxt *ctxt); + +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf); + +void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2); + +extern struct ghcb *boot_ghcb; + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define snp_vmpl 0 diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6266d6b9e0b8..ecda17efa042 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -10,30 +10,19 @@ #include <linux/irqflags.h> #include <linux/jump_label.h> -/* - * The compiler should not reorder volatile asm statements with respect to each - * other: they should execute in program order. However GCC 4.9.x and 5.x have - * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder - * volatile asm. The write functions are not affected since they have memory - * clobbers preventing reordering. To prevent reads from being reordered with - * respect to writes, use a dummy memory operand. - */ - -#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) - void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr0,%0" : "=r" (val)); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr2,%0" : "=r" (val)); return val; } @@ -45,7 +34,7 @@ static __always_inline void native_write_cr2(unsigned long val) static __always_inline unsigned long __native_read_cr3(void) { unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr3,%0" : "=r" (val)); return val; } @@ -66,10 +55,10 @@ static inline unsigned long native_read_cr4(void) asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) - : "=r" (val) : "0" (0), __FORCE_ORDER); + : "=r" (val) : "0" (0)); #else /* CR4 always exists on x86_64. 
*/ - asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr4,%0" : "=r" (val)); #endif return val; } diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 32c0d981a82a..e9cce169bb4c 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -33,11 +33,11 @@ extern size_t strlen(const char *s); static __always_inline void *__memcpy(void *to, const void *from, size_t n) { int d0, d1, d2; - asm volatile("rep ; movsl\n\t" + asm volatile("rep movsl\n\t" "movl %4,%%ecx\n\t" "andl $3,%%ecx\n\t" "jz 1f\n\t" - "rep ; movsb\n\t" + "rep movsb\n\t" "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from) @@ -89,7 +89,7 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, if (n >= 5 * 4) { /* large block: use rep prefix */ int ecx; - asm volatile("rep ; movsl" + asm volatile("rep movsl" : "=&c" (ecx), "=&D" (edi), "=&S" (esi) : "0" (n / 4), "1" (edi), "2" (esi) : "memory" @@ -165,8 +165,7 @@ extern void *memchr(const void *cs, int c, size_t count); static inline void *__memset_generic(void *s, char c, size_t count) { int d0, d1; - asm volatile("rep\n\t" - "stosb" + asm volatile("rep stosb" : "=&c" (d0), "=&D" (d1) : "a" (c), "1" (s), "0" (count) : "memory"); @@ -199,8 +198,7 @@ extern void *memset(void *, int, size_t); static inline void *memset16(uint16_t *s, uint16_t v, size_t n) { int d0, d1; - asm volatile("rep\n\t" - "stosw" + asm volatile("rep stosw" : "=&c" (d0), "=&D" (d1) : "a" (v), "1" (s), "0" (n) : "memory"); @@ -211,8 +209,7 @@ static inline void *memset16(uint16_t *s, uint16_t v, size_t n) static inline void *memset32(uint32_t *s, uint32_t v, size_t n) { int d0, d1; - asm volatile("rep\n\t" - "stosl" + asm volatile("rep stosl" : "=&c" (d0), "=&D" (d1) : "a" (v), "1" (s), "0" (n) : "memory"); diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index ab9e143ec9fe..5337f1be18f6 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -11,11 +11,11 @@ * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ -#define POKE_MAX_OPCODE_SIZE 5 +#define TEXT_POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); -extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); +extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); /* * Clear and restore the kernel write-protection flag on the local CPU. @@ -32,17 +32,17 @@ extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u * an inconsistent instruction while you patch. 
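The string_32.h hunks above settle on the "rep movsl" / "rep stosb" spelling with no semicolon after the prefix. Written as stand-alone inline asm, the rep-movsb pattern looks like this; x86 only, GCC-style constraints, and rep_movsb() is a made-up wrapper rather than a kernel helper.

#include <stdio.h>
#include <stddef.h>

static void *rep_movsb(void *to, const void *from, size_t n)
{
        void *ret = to;

        /* rep movsb copies RCX bytes from [RSI] to [RDI] */
        asm volatile("rep movsb"
                     : "+D" (to), "+S" (from), "+c" (n)
                     :
                     : "memory");
        return ret;
}

int main(void)
{
        char dst[8] = { 0 };

        rep_movsb(dst, "hello", 6);
        puts(dst);
        return 0;
}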
*/ extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void text_poke_sync(void); +extern void smp_text_poke_sync_each_cpu(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); #define text_poke_copy text_poke_copy extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); -extern int poke_int3_handler(struct pt_regs *regs); -extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); +extern int smp_text_poke_int3_handler(struct pt_regs *regs); +extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_finish(void); +extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate); +extern void smp_text_poke_batch_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC @@ -82,7 +82,7 @@ static __always_inline int text_opcode_size(u8 opcode) } union text_poke_insn { - u8 text[POKE_MAX_OPCODE_SIZE]; + u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; @@ -128,8 +128,8 @@ void *text_gen_insn(u8 opcode, const void *addr, const void *dest) } extern int after_bootmem; -extern __ro_after_init struct mm_struct *poking_mm; -extern __ro_after_init unsigned long poking_addr; +extern __ro_after_init struct mm_struct *text_poke_mm; +extern __ro_after_init unsigned long text_poke_mm_addr; #ifndef CONFIG_UML_X86 static __always_inline @@ -142,13 +142,14 @@ static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* - * The int3 handler in entry_64.S adds a gap between the + * The INT3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of - * this gap. See the idtentry macro's create_gap option. + * this gap. See the idtentry macro's X86_TRAP_BP logic. * - * Similarly entry_32.S will have a gap on the stack for (any) hardware - * exception and pt_regs; see FIXUP_FRAME. + * Similarly, entry_32.S will have a gap on the stack for + * (any) hardware exception and pt_regs; see the + * FIXUP_FRAME macro. 
*/ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index c52f0133425b..c8a5ae35c871 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -26,8 +26,8 @@ extern unsigned long USER_PTR_MAX; */ static inline unsigned long __untagged_addr(unsigned long addr) { - asm (ALTERNATIVE("", - "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM) + asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]", + X86_FEATURE_LAM) : [addr] "+r" (addr) : [mask] "m" (__my_cpu_var(tlbstate_untag_mask))); @@ -54,7 +54,7 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, #endif #define valid_user_address(x) \ - ((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX)) + likely((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX)) /* * Masking the user address is an alternative to a conditional diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h index c9b2ba7a9ec4..7000aeb59aa2 100644 --- a/arch/x86/include/asm/vdso/processor.h +++ b/arch/x86/include/asm/vdso/processor.h @@ -7,15 +7,15 @@ #ifndef __ASSEMBLER__ -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static __always_inline void rep_nop(void) +/* PAUSE is a good thing to insert into busy-wait loops. */ +static __always_inline void native_pause(void) { - asm volatile("rep; nop" ::: "memory"); + asm volatile("pause" ::: "memory"); } static __always_inline void cpu_relax(void) { - rep_nop(); + native_pause(); } struct getcpu_cache; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bf82c6f7d690..ddbc303e41e3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,36 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt -#include <linux/module.h> -#include <linux/sched.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> -#include <linux/mutex.h> -#include <linux/list.h> -#include <linux/stringify.h> -#include <linux/highmem.h> -#include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> -#include <linux/stop_machine.h> -#include <linux/slab.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> -#include <linux/bsearch.h> -#include <linux/sync_core.h> + #include <asm/text-patching.h> -#include <asm/alternative.h> -#include <asm/sections.h> -#include <asm/mce.h> -#include <asm/nmi.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> #include <asm/insn.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/paravirt.h> -#include <asm/asm-prototypes.h> -#include <asm/cfi.h> +#include <asm/nmi.h> int __read_mostly alternatives_patched; @@ -171,13 +149,6 @@ static void add_nop(u8 *buf, unsigned int len) *buf = INT3_INSN_OPCODE; } -extern s32 __retpoline_sites[], __retpoline_sites_end[]; -extern s32 __return_sites[], __return_sites_end[]; -extern s32 __cfi_sites[], __cfi_sites_end[]; -extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern s32 __smp_locks[], __smp_locks_end[]; -void text_poke_early(void *addr, const void *opcode, size_t len); - /* * Matches NOP and NOPL, not any of the other possible NOPs. 
*/ @@ -369,7 +340,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, } } -void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { __apply_relocation(buf, instr, instrlen, repl, repl_len); optimize_nops(instr, buf, instrlen); @@ -525,7 +496,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); + text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -2010,7 +1981,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg) static noinline void __init alt_reloc_selftest(void) { /* - * Tests apply_relocation(). + * Tests text_poke_apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: @@ -2140,76 +2111,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) -{ - temp_mm_state_t temp_state; - - lockdep_assert_irqs_disabled(); - - /* - * Make sure not to be in TLB lazy mode, as otherwise we'll end up - * with a stale address space WITHOUT being in lazy mode after - * restoring the previous mm. - */ - if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(); - - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. 
- */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return temp_state; -} - -__ro_after_init struct mm_struct *poking_mm; -__ro_after_init unsigned long poking_addr; - -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) -{ - lockdep_assert_irqs_disabled(); - - switch_mm_irqs_off(NULL, prev_state.mm, current); - - /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. - */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} +__ro_after_init struct mm_struct *text_poke_mm; +__ro_after_init unsigned long text_poke_mm_addr; static void text_poke_memcpy(void *dst, const void *src, size_t len) { @@ -2229,7 +2132,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; - temp_mm_state_t prev; + struct mm_struct *prev_mm; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; @@ -2266,7 +2169,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l /* * The lock is not really needed, but this allows to avoid open-coding. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); /* * This must not fail; preallocated in poking_init(). @@ -2276,21 +2179,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); pte = mk_pte(pages[0], pgprot); - set_pte_at(poking_mm, poking_addr, ptep, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); - set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ - prev = use_temporary_mm(poking_mm); + prev_mm = use_temporary_mm(text_poke_mm); kasan_disable_current(); - func((u8 *)poking_addr + offset_in_page(addr), src, len); + func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -2299,22 +2202,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l */ barrier(); - pte_clear(poking_mm, poking_addr, ptep); + pte_clear(text_poke_mm, text_poke_mm_addr, ptep); if (cross_page_boundary) - pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ - unuse_temporary_mm(prev); + unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ - flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + (cross_page_boundary ? 
2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); @@ -2450,7 +2353,7 @@ static void do_sync_core(void *info) sync_core(); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { on_each_cpu(do_sync_core, NULL, 1); } @@ -2460,64 +2363,66 @@ void text_poke_sync(void) * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ -struct text_poke_loc { +struct smp_text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; - /* see text_poke_bp_batch() */ + const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; + /* see smp_text_poke_batch_finish() */ u8 old; }; -struct bp_patching_desc { - struct text_poke_loc *vec; +#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) + +static struct smp_text_poke_array { + struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; int nr_entries; - atomic_t refs; -}; +} text_poke_array; -static struct bp_patching_desc bp_desc; +static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static __always_inline -struct bp_patching_desc *try_get_desc(void) +/* + * These four __always_inline annotations imply noinstr, necessary + * due to smp_text_poke_int3_handler() being noinstr: + */ + +static __always_inline bool try_get_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); - if (!raw_atomic_inc_not_zero(&desc->refs)) - return NULL; + if (!raw_atomic_inc_not_zero(refs)) + return false; - return desc; + return true; } -static __always_inline void put_desc(void) +static __always_inline void put_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); smp_mb__before_atomic(); - raw_atomic_dec(&desc->refs); + raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) { - return _stext + tp->rel_addr; + return _stext + tpl->rel_addr; } -static __always_inline int patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) { - struct text_poke_loc *tp = (struct text_poke_loc *) elt; - - if (key < text_poke_addr(tp)) + if (tpl_a < text_poke_addr(tpl_b)) return -1; - if (key > text_poke_addr(tp)) + if (tpl_a > text_poke_addr(tpl_b)) return 1; return 0; } -noinstr int poke_int3_handler(struct pt_regs *regs) +noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct bp_patching_desc *desc; - struct text_poke_loc *tp; + struct smp_text_poke_loc *tpl; int ret = 0; void *ip; @@ -2526,41 +2431,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc with non-zero refcount: + * text_poke_array with non-zero refcount: * - * bp_desc.refs = 1 INT3 - * WMB RMB - * write INT3 if (bp_desc.refs != 0) + * text_poke_array_refs = 1 INT3 + * WMB RMB + * write INT3 if (text_poke_array_refs != 0) */ smp_rmb(); - desc = try_get_desc(); - if (!desc) + if (!try_get_text_poke_array()) return 0; /* - * Discount the INT3. See text_poke_bp_batch(). + * Discount the INT3. See smp_text_poke_batch_finish(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. 
*/ - if (unlikely(desc->nr_entries > 1)) { - tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), + if (unlikely(text_poke_array.nr_entries > 1)) { + tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, + sizeof(struct smp_text_poke_loc), patch_cmp); - if (!tp) + if (!tpl) goto out_put; } else { - tp = desc->vec; - if (text_poke_addr(tp) != ip) + tpl = text_poke_array.vec; + if (text_poke_addr(tpl) != ip) goto out_put; } - ip += tp->len; + ip += tpl->len; - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, @@ -2573,16 +2477,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->disp); + int3_emulate_call(regs, (long)ip + tpl->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->disp); + int3_emulate_jmp(regs, (long)ip + tpl->disp); break; case 0x70 ... 0x7f: /* Jcc */ - int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); + int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); break; default: @@ -2592,51 +2496,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(); + put_text_poke_array(); return ret; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - /** - * text_poke_bp_batch() -- update instructions on live kernel on SMP - * @tp: vector of instructions to patch - * @nr_entries: number of entries in the vector + * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP * - * Modify multi-byte instruction by using int3 breakpoint on SMP. - * We completely avoid stop_machine() here, and achieve the - * synchronization using int3 breakpoint. + * Input state: + * text_poke_array.vec: vector of instructions to patch + * text_poke_array.nr_entries: number of entries in the vector + * + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. * * The way it is done: * - For each entry in the vector: - * - add a int3 trap to the address that will be patched - * - sync cores + * - add an INT3 trap to the address that will be patched + * - SMP sync all CPUs * - For each entry in the vector: * - update all but the first byte of the patched range - * - sync cores + * - SMP sync all CPUs * - For each entry in the vector: - * - replace the first byte (int3) by the first byte of + * - replace the first byte (INT3) by the first byte of the * replacing opcode - * - sync cores + * - SMP sync all CPUs */ -static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +void smp_text_poke_batch_finish(void) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; - lockdep_assert_held(&text_mutex); + if (!text_poke_array.nr_entries) + return; - bp_desc.vec = tp; - bp_desc.nr_entries = nr_entries; + lockdep_assert_held(&text_mutex); /* - * Corresponds to the implicit memory barrier in try_get_desc() to - * ensure reading a non-zero refcount provides up to date bp_desc data. + * Corresponds to the implicit memory barrier in try_get_text_poke_array() to + * ensure reading a non-zero refcount provides up to date text_poke_array data. 
*/ - atomic_set_release(&bp_desc.refs, 1); + for_each_possible_cpu(i) + atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); /* * Function tracing can enable thousands of places that need to be @@ -2649,33 +2552,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries cond_resched(); /* - * Corresponding read barrier in int3 notifier for making sure the - * nr_entries and handler are correctly ordered wrt. patching. + * Corresponding read barrier in INT3 notifier for making sure the + * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* - * First step: add a int3 trap to the address that will be patched. + * First step: add a INT3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) { - tp[i].old = *(u8 *)text_poke_addr(&tp[i]); - text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + for (i = 0; i < text_poke_array.nr_entries; i++) { + text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Second step: update all but the first byte of the patched range. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; - u8 _new[POKE_MAX_OPCODE_SIZE+1]; - const u8 *new = tp[i].text; - int len = tp[i].len; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; + u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; + const u8 *new = text_poke_array.vec[i].text; + int len = text_poke_array.vec[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, - text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { @@ -2684,7 +2587,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries new = _new; } - text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); @@ -2715,7 +2618,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * The old instruction is recorded so that the event can be * processed forwards or backwards. */ - perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); + perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); } if (do_sync) { @@ -2724,63 +2627,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } /* - * Third step: replace the first byte (int3) by the first byte of + * Third step: replace the first byte (INT3) by the first byte of the * replacing opcode. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 byte = tp[i].text[0]; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 byte = text_poke_array.vec[i].text[0]; - if (tp[i].len == 6) + if (text_poke_array.vec[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; - text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Remove and wait for refs to be zero. 
+ * + * Notably, if after step-3 above the INT3 got removed, then the + * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 + * handlers and the below spin-wait will not happen. + * + * IOW. unless the replacement instruction is INT3, this case goes + * unused. */ - if (!atomic_dec_and_test(&bp_desc.refs)) - atomic_cond_read_acquire(&bp_desc.refs, !VAL); + for_each_possible_cpu(i) { + atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); + + if (unlikely(!atomic_dec_and_test(refs))) + atomic_cond_read_acquire(refs, !VAL); + } + + /* They are all completed: */ + text_poke_array.nr_entries = 0; } -static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { + struct smp_text_poke_loc *tpl; struct insn insn; int ret, i = 0; + tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; + if (len == 6) i = 1; - memcpy((void *)tp->text, opcode+i, len-i); + memcpy((void *)tpl->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); - tp->rel_addr = addr - (void *)_stext; - tp->len = len; - tp->opcode = insn.opcode.bytes[0]; + tpl->rel_addr = addr - (void *)_stext; + tpl->len = len; + tpl->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ - tp->opcode = insn.opcode.bytes[1] - 0x10; + tpl->opcode = insn.opcode.bytes[1] - 0x10; } - switch (tp->opcode) { + switch (tpl->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: @@ -2789,14 +2708,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) - BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -2805,21 +2724,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ - tp->disp = insn.immediate.value; + tpl->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP8_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP8_INSN_OPCODE; + tpl->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP32_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP32_INSN_OPCODE; + tpl->disp = 0; break; default: /* unknown instruction */ @@ -2830,51 +2749,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } /* - * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing * early if needed. 
*/ -static bool tp_order_fail(void *addr) +static bool text_poke_addr_ordered(void *addr) { - struct text_poke_loc *tp; - - if (!tp_vec_nr) - return false; + WARN_ON_ONCE(!addr); - if (!addr) /* force */ + if (!text_poke_array.nr_entries) return true; - tp = &tp_vec[tp_vec_nr - 1]; - if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) - return true; - - return false; -} - -static void text_poke_flush(void *addr) -{ - if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { - text_poke_bp_batch(tp_vec, tp_vec_nr); - tp_vec_nr = 0; - } -} + /* + * If the last current entry's address is higher than the + * new entry's address we'd like to add, then ordering + * is violated and we must first flush all pending patching + * requests: + */ + if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) + return false; -void text_poke_finish(void) -{ - text_poke_flush(NULL); + return true; } -void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +/** + * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @emulate: instruction to be emulated + * + * Add a new instruction to the current queue of to-be-patched instructions + * the kernel maintains. The patching request will not be executed immediately, + * but becomes part of an array of patching requests, optimized for batched + * execution. All pending patching requests will be executed on the next + * smp_text_poke_batch_finish() call. + */ +void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc *tp; - - text_poke_flush(addr); - - tp = &tp_vec[tp_vec_nr++]; - text_poke_loc_init(tp, addr, opcode, len, emulate); + if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) + smp_text_poke_batch_finish(); + __smp_text_poke_batch_add(addr, opcode, len, emulate); } /** - * text_poke_bp() -- update instructions on live kernel on SMP + * smp_text_poke_single() -- update instruction on live kernel on SMP immediately * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy @@ -2882,12 +2800,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is - * not possible to allocate memory. + * not possible to allocate memory for a vector. The single instruction + * is patched in immediately. 
*/ -void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp; - - text_poke_loc_init(&tp, addr, opcode, len, emulate); - text_poke_bp_batch(&tp, 1); + __smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_finish(); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index eebc360ed1bb..ba5a4ccda37a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1486,7 +1486,7 @@ static void __init delay_with_tsc(void) * 1 GHz == 40 jiffies */ do { - rep_nop(); + native_pause(); now = rdtsc(); } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end)); } diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index d86d7d6e750c..a951333c5995 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -185,7 +185,7 @@ static void *patch_dest(void *dest, bool direct) u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); - apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); + text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); /* Already patched? */ if (!bcmp(pad, insn_buff, tsize)) @@ -294,7 +294,7 @@ static bool is_callthunk(void *addr) pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } @@ -312,7 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 606c0cb97e86..246a76ddbf46 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -34,21 +34,63 @@ #include "cpu.h" +/* + * Speculation Vulnerability Handling + * + * Each vulnerability is handled with the following functions: + * <vuln>_select_mitigation() -- Selects a mitigation to use. This should + * take into account all relevant command line + * options. + * <vuln>_update_mitigation() -- This is called after all vulnerabilities have + * selected a mitigation, in case the selection + * may want to change based on other choices + * made. This function is optional. + * <vuln>_apply_mitigation() -- Enable the selected mitigation. + * + * The compile-time mitigation in all cases should be AUTO. An explicit + * command-line option can override AUTO. If no such option is + * provided, <vuln>_select_mitigation() will override AUTO to the best + * mitigation option. 
+ */ + static void __init spectre_v1_select_mitigation(void); +static void __init spectre_v1_apply_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init spectre_v2_update_mitigation(void); +static void __init spectre_v2_apply_mitigation(void); static void __init retbleed_select_mitigation(void); +static void __init retbleed_update_mitigation(void); +static void __init retbleed_apply_mitigation(void); static void __init spectre_v2_user_select_mitigation(void); +static void __init spectre_v2_user_update_mitigation(void); +static void __init spectre_v2_user_apply_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init ssb_apply_mitigation(void); static void __init l1tf_select_mitigation(void); +static void __init l1tf_apply_mitigation(void); static void __init mds_select_mitigation(void); -static void __init md_clear_update_mitigation(void); -static void __init md_clear_select_mitigation(void); +static void __init mds_update_mitigation(void); +static void __init mds_apply_mitigation(void); static void __init taa_select_mitigation(void); +static void __init taa_update_mitigation(void); +static void __init taa_apply_mitigation(void); static void __init mmio_select_mitigation(void); +static void __init mmio_update_mitigation(void); +static void __init mmio_apply_mitigation(void); +static void __init rfds_select_mitigation(void); +static void __init rfds_update_mitigation(void); +static void __init rfds_apply_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init srbds_apply_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); +static void __init srso_update_mitigation(void); +static void __init srso_apply_mitigation(void); static void __init gds_select_mitigation(void); +static void __init gds_apply_mitigation(void); +static void __init bhi_select_mitigation(void); +static void __init bhi_update_mitigation(void); +static void __init bhi_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -127,9 +169,13 @@ EXPORT_SYMBOL_GPL(mds_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ -DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); -EXPORT_SYMBOL_GPL(mmio_stale_data_clear); +/* + * Controls CPU Fill buffer clear before VMenter. This is a subset of + * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only + * mitigation is required. + */ +DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); +EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); void __init cpu_select_mitigations(void) { @@ -154,30 +200,60 @@ void __init cpu_select_mitigations(void) /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - /* - * retbleed_select_mitigation() relies on the state set by - * spectre_v2_select_mitigation(); specifically it wants to know about - * spectre_v2=ibrs. - */ retbleed_select_mitigation(); - /* - * spectre_v2_user_select_mitigation() relies on the state set by - * retbleed_select_mitigation(); specifically the STIBP selection is - * forced for UNRET or IBPB. 
- */ spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); - md_clear_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + rfds_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); + srso_select_mitigation(); + gds_select_mitigation(); + bhi_select_mitigation(); /* - * srso_select_mitigation() depends and must run after - * retbleed_select_mitigation(). + * After mitigations are selected, some may need to update their + * choices. */ - srso_select_mitigation(); - gds_select_mitigation(); + spectre_v2_update_mitigation(); + /* + * retbleed_update_mitigation() relies on the state set by + * spectre_v2_update_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_update_mitigation(); + + /* + * spectre_v2_user_update_mitigation() depends on + * retbleed_update_mitigation(), specifically the STIBP + * selection is forced for UNRET or IBPB. + */ + spectre_v2_user_update_mitigation(); + mds_update_mitigation(); + taa_update_mitigation(); + mmio_update_mitigation(); + rfds_update_mitigation(); + bhi_update_mitigation(); + /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ + srso_update_mitigation(); + + spectre_v1_apply_mitigation(); + spectre_v2_apply_mitigation(); + retbleed_apply_mitigation(); + spectre_v2_user_apply_mitigation(); + ssb_apply_mitigation(); + l1tf_apply_mitigation(); + mds_apply_mitigation(); + taa_apply_mitigation(); + mmio_apply_mitigation(); + rfds_apply_mitigation(); + srbds_apply_mitigation(); + srso_apply_mitigation(); + gds_apply_mitigation(); + bhi_apply_mitigation(); } /* @@ -280,6 +356,12 @@ enum rfds_mitigations { static enum rfds_mitigations rfds_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; +/* + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing + * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. + */ +static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; + static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { @@ -290,12 +372,34 @@ static void __init mds_select_mitigation(void) if (mds_mitigation == MDS_MITIGATION_AUTO) mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_OFF) + return; + + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; + + /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. 
*/ + if (verw_clear_cpu_buf_mitigation_selected) + mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; + } - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + pr_info("%s\n", mds_strings[mds_mitigation]); +} +static void __init mds_apply_mitigation(void) +{ + if (mds_mitigation == MDS_MITIGATION_FULL || + mds_mitigation == MDS_MITIGATION_VMWERV) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); @@ -335,6 +439,11 @@ static const char * const taa_strings[] = { [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", }; +static bool __init taa_vulnerable(void) +{ + return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM); +} + static void __init taa_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_TAA)) { @@ -348,48 +457,63 @@ static void __init taa_select_mitigation(void) return; } - if (cpu_mitigations_off()) { + if (cpu_mitigations_off()) taa_mitigation = TAA_MITIGATION_OFF; - return; - } - /* - * TAA mitigation via VERW is turned off if both - * tsx_async_abort=off and mds=off are specified. - */ - if (taa_mitigation == TAA_MITIGATION_OFF && - mds_mitigation == MDS_MITIGATION_OFF) + /* Microcode will be checked in taa_update_mitigation(). */ + if (taa_mitigation == TAA_MITIGATION_AUTO) + taa_mitigation = TAA_MITIGATION_VERW; + + if (taa_mitigation != TAA_MITIGATION_OFF) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init taa_update_mitigation(void) +{ + if (!taa_vulnerable() || cpu_mitigations_off()) return; - if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + if (verw_clear_cpu_buf_mitigation_selected) taa_mitigation = TAA_MITIGATION_VERW; - else - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. - * A microcode update fixes this behavior to clear CPU buffers. It also - * adds support for MSR_IA32_TSX_CTRL which is enumerated by the - * ARCH_CAP_TSX_CTRL_MSR bit. - * - * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode - * update is required. - */ - if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && - !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + if (taa_mitigation == TAA_MITIGATION_VERW) { + /* Check if the requisite ucode is available. */ + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * TSX is enabled, select alternate mitigation for TAA which is - * the same as MDS. Enable MDS static branch to clear CPU buffers. - * - * For guests that can't determine whether the correct microcode is - * present on host, enable the mitigation for UCODE_NEEDED as well. - */ - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. 
+ */ + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + } - if (taa_nosmt || cpu_mitigations_auto_nosmt()) - cpu_smt_disable(false); + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static void __init taa_apply_mitigation(void) +{ + if (taa_mitigation == TAA_MITIGATION_VERW || + taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) { + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + } } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -432,25 +556,62 @@ static void __init mmio_select_mitigation(void) return; } + /* Microcode will be checked in mmio_update_mitigation(). */ + if (mmio_mitigation == MMIO_MITIGATION_AUTO) + mmio_mitigation = MMIO_MITIGATION_VERW; + if (mmio_mitigation == MMIO_MITIGATION_OFF) return; /* * Enable CPU buffer clear mitigation for host and VMM, if also affected - * by MDS or TAA. Otherwise, enable mitigation for VMM only. + * by MDS or TAA. */ - if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && - boot_cpu_has(X86_FEATURE_RTM))) - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mmio_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) + mmio_mitigation = MMIO_MITIGATION_VERW; + + if (mmio_mitigation == MMIO_MITIGATION_VERW) { + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))) + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", mmio_strings[mmio_mitigation]); +} + +static void __init mmio_apply_mitigation(void) +{ + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; /* - * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based - * mitigations, disable KVM-only mitigation in that case. + * Only enable the VMM mitigation if the CPU buffer clear mitigation is + * not being used. */ - if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - static_branch_disable(&mmio_stale_data_clear); - else - static_branch_enable(&mmio_stale_data_clear); + if (verw_clear_cpu_buf_mitigation_selected) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + static_branch_disable(&cpu_buf_vm_clear); + } else { + static_branch_enable(&cpu_buf_vm_clear); + } /* * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can @@ -460,21 +621,6 @@ static void __init mmio_select_mitigation(void) if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); - /* - * Check if the system has the right microcode. - * - * CPU Fill buffer clear mitigation is enumerated by either an explicit - * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS - * affected systems. 
- */ - if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || - (boot_cpu_has(X86_FEATURE_MD_CLEAR) && - boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) - mmio_mitigation = MMIO_MITIGATION_VERW; - else - mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; - if (mmio_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } @@ -509,22 +655,48 @@ static const char * const rfds_strings[] = { [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", }; +static inline bool __init verw_clears_cpu_reg_file(void) +{ + return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR); +} + static void __init rfds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { rfds_mitigation = RFDS_MITIGATION_OFF; return; } + + if (rfds_mitigation == RFDS_MITIGATION_AUTO) + rfds_mitigation = RFDS_MITIGATION_VERW; + if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (rfds_mitigation == RFDS_MITIGATION_AUTO) + if (verw_clears_cpu_reg_file()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init rfds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) rfds_mitigation = RFDS_MITIGATION_VERW; - if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) + if (rfds_mitigation == RFDS_MITIGATION_VERW) { + if (!verw_clears_cpu_reg_file()) + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", rfds_strings[rfds_mitigation]); +} + +static void __init rfds_apply_mitigation(void) +{ + if (rfds_mitigation == RFDS_MITIGATION_VERW) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); - else - rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; } static __init int rfds_parse_cmdline(char *str) @@ -545,74 +717,11 @@ static __init int rfds_parse_cmdline(char *str) early_param("reg_file_data_sampling", rfds_parse_cmdline); #undef pr_fmt -#define pr_fmt(fmt) "" fmt - -static void __init md_clear_update_mitigation(void) -{ - if (cpu_mitigations_off()) - return; - - if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - goto out; - - /* - * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO - * Stale Data mitigation, if necessary. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } - if (taa_mitigation == TAA_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_TAA)) { - taa_mitigation = TAA_MITIGATION_VERW; - taa_select_mitigation(); - } - /* - * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear - * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. 
- */ - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { - mmio_mitigation = MMIO_MITIGATION_VERW; - mmio_select_mitigation(); - } - if (rfds_mitigation == RFDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_RFDS)) { - rfds_mitigation = RFDS_MITIGATION_VERW; - rfds_select_mitigation(); - } -out: - if (boot_cpu_has_bug(X86_BUG_MDS)) - pr_info("MDS: %s\n", mds_strings[mds_mitigation]); - if (boot_cpu_has_bug(X86_BUG_TAA)) - pr_info("TAA: %s\n", taa_strings[taa_mitigation]); - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) - pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); - if (boot_cpu_has_bug(X86_BUG_RFDS)) - pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); -} - -static void __init md_clear_select_mitigation(void) -{ - mds_select_mitigation(); - taa_select_mitigation(); - mmio_select_mitigation(); - rfds_select_mitigation(); - - /* - * As these mitigations are inter-related and rely on VERW instruction - * to clear the microarchitural buffers, update and print their status - * after mitigation selection is done for each of these vulnerabilities. - */ - md_clear_update_mitigation(); -} - -#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_AUTO, SRBDS_MITIGATION_UCODE_NEEDED, SRBDS_MITIGATION_FULL, SRBDS_MITIGATION_TSX_OFF, @@ -620,7 +729,7 @@ enum srbds_mitigations { }; static enum srbds_mitigations srbds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -671,8 +780,13 @@ void update_srbds_msr(void) static void __init srbds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) { + srbds_mitigation = SRBDS_MITIGATION_OFF; return; + } + + if (srbds_mitigation == SRBDS_MITIGATION_AUTO) + srbds_mitigation = SRBDS_MITIGATION_FULL; /* * Check to see if this is one of the MDS_NO systems supporting TSX that @@ -686,13 +800,17 @@ static void __init srbds_select_mitigation(void) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; - else if (cpu_mitigations_off() || srbds_off) + else if (srbds_off) srbds_mitigation = SRBDS_MITIGATION_OFF; - update_srbds_msr(); pr_info("%s\n", srbds_strings[srbds_mitigation]); } +static void __init srbds_apply_mitigation(void) +{ + update_srbds_msr(); +} + static int __init srbds_parse_cmdline(char *str) { if (!str) @@ -739,6 +857,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline); enum gds_mitigations { GDS_MITIGATION_OFF, + GDS_MITIGATION_AUTO, GDS_MITIGATION_UCODE_NEEDED, GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, @@ -747,7 +866,7 @@ enum gds_mitigations { }; static enum gds_mitigations gds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_GDS) ? 
GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -788,6 +907,7 @@ void update_gds_msr(void) case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: + case GDS_MITIGATION_AUTO: return; } @@ -811,26 +931,21 @@ static void __init gds_select_mitigation(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { gds_mitigation = GDS_MITIGATION_HYPERVISOR; - goto out; + return; } if (cpu_mitigations_off()) gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ + if (gds_mitigation == GDS_MITIGATION_AUTO) + gds_mitigation = GDS_MITIGATION_FULL; + /* No microcode */ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { - if (gds_mitigation == GDS_MITIGATION_FORCE) { - /* - * This only needs to be done on the boot CPU so do it - * here rather than in update_gds_msr() - */ - setup_clear_cpu_cap(X86_FEATURE_AVX); - pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); - } else { + if (gds_mitigation != GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; - } - goto out; + return; } /* Microcode has mitigation, use it */ @@ -851,9 +966,25 @@ static void __init gds_select_mitigation(void) */ gds_mitigation = GDS_MITIGATION_FULL_LOCKED; } +} + +static void __init gds_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + /* Microcode is present */ + if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL) + update_gds_msr(); + else if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } - update_gds_msr(); -out: pr_info("%s\n", gds_strings[gds_mitigation]); } @@ -914,10 +1045,14 @@ static bool smap_works_speculatively(void) static void __init spectre_v1_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; +} + +static void __init spectre_v1_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) return; - } if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { /* @@ -972,6 +1107,7 @@ enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; enum retbleed_mitigation { RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_AUTO, RETBLEED_MITIGATION_UNRET, RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, @@ -979,14 +1115,6 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_STUFF, }; -enum retbleed_mitigation_cmd { - RETBLEED_CMD_OFF, - RETBLEED_CMD_AUTO, - RETBLEED_CMD_UNRET, - RETBLEED_CMD_IBPB, - RETBLEED_CMD_STUFF, -}; - static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", @@ -997,9 +1125,7 @@ static const char * const retbleed_strings[] = { }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = - RETBLEED_MITIGATION_NONE; -static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? 
RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE; static int __ro_after_init retbleed_nosmt = false; @@ -1016,15 +1142,15 @@ static int __init retbleed_parse_cmdline(char *str) } if (!strcmp(str, "off")) { - retbleed_cmd = RETBLEED_CMD_OFF; + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } else if (!strcmp(str, "auto")) { - retbleed_cmd = RETBLEED_CMD_AUTO; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else if (!strcmp(str, "unret")) { - retbleed_cmd = RETBLEED_CMD_UNRET; + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else if (!strcmp(str, "ibpb")) { - retbleed_cmd = RETBLEED_CMD_IBPB; + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else if (!strcmp(str, "stuff")) { - retbleed_cmd = RETBLEED_CMD_STUFF; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else if (!strcmp(str, "force")) { @@ -1045,72 +1171,109 @@ early_param("retbleed", retbleed_parse_cmdline); static void __init retbleed_select_mitigation(void) { - bool mitigate_smt = false; - - if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) - return; - - switch (retbleed_cmd) { - case RETBLEED_CMD_OFF: + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) { + retbleed_mitigation = RETBLEED_MITIGATION_NONE; return; + } - case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - } else { + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); - goto do_cmd_auto; } break; - - case RETBLEED_CMD_IBPB: + case RETBLEED_MITIGATION_IBPB: if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); - goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; - } else { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } break; + case RETBLEED_MITIGATION_STUFF: + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } + break; + default: + break; + } - case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && - spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO) + return; - } else { - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) - pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); - else - pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + /* Intel mitigation selected in retbleed_update_mitigation() */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = 
RETBLEED_MITIGATION_IBPB; + else + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +} - goto do_cmd_auto; - } - break; +static void __init retbleed_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) + goto out; -do_cmd_auto: - case RETBLEED_CMD_AUTO: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && - boot_cpu_has(X86_FEATURE_IBPB)) - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + /* + * retbleed=stuff is only allowed on Intel. If stuffing can't be used + * then a different mitigation will be selected below. + */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF) { + if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) { + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } + } + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option except for call depth based stuffing + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); + } + /* If nothing has set the mitigation yet, default to NONE. */ + if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO) + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +out: + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} - /* - * The Intel mitigation (IBRS or eIBRS) was already selected in - * spectre_v2_select_mitigation(). 'retbleed_mitigation' will - * be set accordingly below. 
- */ - break; - } +static void __init retbleed_apply_mitigation(void) +{ + bool mitigate_smt = false; switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_NONE: + return; + case RETBLEED_MITIGATION_UNRET: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); @@ -1160,28 +1323,6 @@ do_cmd_auto: if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && (retbleed_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); - - /* - * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option except for call depth based stuffing - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - switch (spectre_v2_enabled) { - case SPECTRE_V2_IBRS: - retbleed_mitigation = RETBLEED_MITIGATION_IBRS; - break; - case SPECTRE_V2_EIBRS: - case SPECTRE_V2_EIBRS_RETPOLINE: - case SPECTRE_V2_EIBRS_LFENCE: - retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; - break; - default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) - pr_err(RETBLEED_INTEL_MSG); - } - } - - pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } #undef pr_fmt @@ -1261,6 +1402,8 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, @@ -1299,31 +1442,18 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } -static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; - -static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(void) +static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { - enum spectre_v2_user_cmd mode; char arg[20]; int ret, i; - mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? - SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; - - switch (spectre_v2_cmd) { - case SPECTRE_V2_CMD_NONE: + if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) return SPECTRE_V2_USER_CMD_NONE; - case SPECTRE_V2_CMD_FORCE: - return SPECTRE_V2_USER_CMD_FORCE; - default: - break; - } ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) - return mode; + return SPECTRE_V2_USER_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { @@ -1334,7 +1464,7 @@ spectre_v2_parse_user_cmdline(void) } pr_err("Unknown user space protection option (%s). 
Switching to default\n", arg); - return mode; + return SPECTRE_V2_USER_CMD_AUTO; } static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) @@ -1342,60 +1472,72 @@ static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } -static void __init -spectre_v2_user_select_mitigation(void) +static void __init spectre_v2_user_select_mitigation(void) { - enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; - enum spectre_v2_user_cmd cmd; - if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - cmd = spectre_v2_parse_user_cmdline(); - switch (cmd) { + switch (spectre_v2_parse_user_cmdline()) { case SPECTRE_V2_USER_CMD_NONE: - goto set_mode; + return; case SPECTRE_V2_USER_CMD_FORCE: - mode = SPECTRE_V2_USER_STRICT; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; + break; case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_SECCOMP: + if (IS_ENABLED(CONFIG_SECCOMP)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP; + else + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = spectre_v2_user_ibpb; + break; case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPECTRE_V2_USER_SECCOMP; + spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP; else - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; } - /* Initialize Indirect Branch Prediction Barrier */ - if (boot_cpu_has(X86_FEATURE_IBPB)) { - static_branch_enable(&switch_vcpu_ibpb); + /* + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; - spectre_v2_user_ibpb = mode; - switch (cmd) { - case SPECTRE_V2_USER_CMD_NONE: - break; - case SPECTRE_V2_USER_CMD_FORCE: - case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: - static_branch_enable(&switch_mm_always_ibpb); - spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; - break; - case SPECTRE_V2_USER_CMD_PRCTL: - case SPECTRE_V2_USER_CMD_AUTO: - case SPECTRE_V2_USER_CMD_SECCOMP: - static_branch_enable(&switch_mm_cond_ibpb); - break; - } + if (!boot_cpu_has(X86_FEATURE_IBPB)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; - pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", - static_key_enabled(&switch_mm_always_ibpb) ? 
- "always-on" : "conditional"); + if (!boot_cpu_has(X86_FEATURE_STIBP)) + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; +} + +static void __init spectre_v2_user_update_mitigation(void) +{ + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + /* The spectre_v2 cmd line can override spectre_v2_user options */ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; + } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; } /* @@ -1413,30 +1555,44 @@ spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_STIBP) || !cpu_smt_possible() || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && - !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) { + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; return; + } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. - */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - - if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || - retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (mode != SPECTRE_V2_USER_STRICT && - mode != SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE && + (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) { + if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT && + spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED) pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); - mode = SPECTRE_V2_USER_STRICT_PREFERRED; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; } + pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]); +} + +static void __init spectre_v2_user_apply_mitigation(void) +{ + /* Initialize Indirect Branch Prediction Barrier */ + if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) { + static_branch_enable(&switch_vcpu_ibpb); - spectre_v2_user_stibp = mode; + switch (spectre_v2_user_ibpb) { + case SPECTRE_V2_USER_STRICT: + static_branch_enable(&switch_mm_always_ibpb); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + static_branch_enable(&switch_mm_cond_ibpb); + break; + default: + break; + } -set_mode: - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); + } } static const char * const spectre_v2_strings[] = { @@ -1656,12 +1812,13 @@ static bool __init spec_ctrl_bhi_dis(void) enum bhi_mitigations { BHI_MITIGATION_OFF, + BHI_MITIGATION_AUTO, BHI_MITIGATION_ON, BHI_MITIGATION_VMEXIT_ONLY, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? 
BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { @@ -1683,6 +1840,25 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline); static void __init bhi_select_mitigation(void) { + if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off()) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (bhi_mitigation == BHI_MITIGATION_AUTO) + bhi_mitigation = BHI_MITIGATION_ON; +} + +static void __init bhi_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) + bhi_mitigation = BHI_MITIGATION_OFF; +} + +static void __init bhi_apply_mitigation(void) +{ if (bhi_mitigation == BHI_MITIGATION_OFF) return; @@ -1714,75 +1890,80 @@ static void __init bhi_select_mitigation(void) static void __init spectre_v2_select_mitigation(void) { - enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); - enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + spectre_v2_cmd = spectre_v2_parse_cmdline(); - /* - * If the CPU is not affected and the command line mode is NONE or AUTO - * then nothing to do. - */ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) return; - switch (cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return; case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_EIBRS; - break; - } - - if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && - boot_cpu_has_bug(X86_BUG_RETBLEED) && - retbleed_cmd != RETBLEED_CMD_OFF && - retbleed_cmd != RETBLEED_CMD_STUFF && - boot_cpu_has(X86_FEATURE_IBRS) && - boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; } - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: pr_err(SPECTRE_V2_LFENCE_MSG); - mode = SPECTRE_V2_LFENCE; + spectre_v2_enabled = SPECTRE_V2_LFENCE; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - mode = SPECTRE_V2_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_RETPOLINE; break; case SPECTRE_V2_CMD_RETPOLINE: - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_IBRS: - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_IBRS; break; case SPECTRE_V2_CMD_EIBRS: - mode = SPECTRE_V2_EIBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; case SPECTRE_V2_CMD_EIBRS_LFENCE: - mode = SPECTRE_V2_EIBRS_LFENCE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE; break; case SPECTRE_V2_CMD_EIBRS_RETPOLINE: - mode = SPECTRE_V2_EIBRS_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE; break; } +} + +static void __init spectre_v2_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) { + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_mitigation != RETBLEED_MITIGATION_NONE && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + spectre_v2_enabled = SPECTRE_V2_IBRS; + } + } - if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off()) + pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]); +} + +static void __init 
spectre_v2_apply_mitigation(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_ibrs_mode(mode)) { + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); } else { @@ -1791,8 +1972,10 @@ static void __init spectre_v2_select_mitigation(void) } } - switch (mode) { + switch (spectre_v2_enabled) { case SPECTRE_V2_NONE: + return; + case SPECTRE_V2_EIBRS: break; @@ -1818,18 +2001,12 @@ static void __init spectre_v2_select_mitigation(void) * JMPs gets protection against BHI and Intramode-BTI, but RET * prediction from a non-RSB predictor is still a risk. */ - if (mode == SPECTRE_V2_EIBRS_LFENCE || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_RETPOLINE) + if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE || + spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE || + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); - if (boot_cpu_has(X86_BUG_BHI)) - bhi_select_mitigation(); - - spectre_v2_enabled = mode; - pr_info("%s\n", spectre_v2_strings[mode]); - - spectre_v2_select_rsb_mitigation(mode); + spectre_v2_select_rsb_mitigation(spectre_v2_enabled); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -1837,28 +2014,26 @@ static void __init spectre_v2_select_mitigation(void) * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't * otherwise enabled. * - * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because - * the user might select retpoline on the kernel command line and if - * the CPU supports Enhanced IBRS, kernel might un-intentionally not - * enable IBRS around firmware calls. + * Use "spectre_v2_enabled" to check Enhanced IBRS instead of + * boot_cpu_has(), because the user might select retpoline on the kernel + * command line and if the CPU supports Enhanced IBRS, kernel might + * un-intentionally not enable IBRS around firmware calls. */ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && boot_cpu_has(X86_FEATURE_IBPB) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { - if (retbleed_cmd != RETBLEED_CMD_IBPB) { + if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); pr_info("Enabling Speculation Barrier for firmware calls\n"); } - } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + } else if (boot_cpu_has(X86_FEATURE_IBRS) && + !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } - - /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) @@ -2047,19 +2222,18 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) return cmd; } -static enum ssb_mitigation __init __ssb_select_mitigation(void) +static void __init ssb_select_mitigation(void) { - enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; enum ssb_mitigation_cmd cmd; if (!boot_cpu_has(X86_FEATURE_SSBD)) - return mode; + goto out; cmd = ssb_parse_cmdline(); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && (cmd == SPEC_STORE_BYPASS_CMD_NONE || cmd == SPEC_STORE_BYPASS_CMD_AUTO)) - return mode; + return; switch (cmd) { case SPEC_STORE_BYPASS_CMD_SECCOMP: @@ -2068,28 +2242,35 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) * enabled. 
*/ if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPEC_STORE_BYPASS_SECCOMP; + ssb_mode = SPEC_STORE_BYPASS_SECCOMP; else - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_ON: - mode = SPEC_STORE_BYPASS_DISABLE; + ssb_mode = SPEC_STORE_BYPASS_DISABLE; break; case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_NONE: break; } +out: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +static void __init ssb_apply_mitigation(void) +{ /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation */ - if (mode == SPEC_STORE_BYPASS_DISABLE) { + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may @@ -2103,16 +2284,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) update_spec_ctrl(x86_spec_ctrl_base); } } - - return mode; -} - -static void ssb_select_mitigation(void) -{ - ssb_mode = __ssb_select_mitigation(); - - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt @@ -2368,7 +2539,7 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif @@ -2416,22 +2587,33 @@ static void override_cache_bits(struct cpuinfo_x86 *c) static void __init l1tf_select_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) { + l1tf_mitigation = L1TF_MITIGATION_OFF; + return; + } + + if (l1tf_mitigation == L1TF_MITIGATION_AUTO) { + if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + } +} + +static void __init l1tf_apply_mitigation(void) +{ u64 half_pa; if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; - if (cpu_mitigations_off()) - l1tf_mitigation = L1TF_MITIGATION_OFF; - else if (cpu_mitigations_auto_nosmt()) - l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; - override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_AUTO: break; case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -2491,6 +2673,7 @@ early_param("l1tf", l1tf_cmdline); enum srso_mitigation { SRSO_MITIGATION_NONE, + SRSO_MITIGATION_AUTO, SRSO_MITIGATION_UCODE_NEEDED, SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, @@ -2500,14 +2683,6 @@ enum srso_mitigation { SRSO_MITIGATION_BP_SPEC_REDUCE, }; -enum srso_mitigation_cmd { - SRSO_CMD_OFF, - SRSO_CMD_MICROCODE, - SRSO_CMD_SAFE_RET, - SRSO_CMD_IBPB, - SRSO_CMD_IBPB_ON_VMEXIT, -}; - static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", @@ -2519,8 +2694,7 @@ static const char * const srso_strings[] = { [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: 
Reduced Speculation" }; -static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; -static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO; static int __init srso_parse_cmdline(char *str) { @@ -2528,15 +2702,15 @@ static int __init srso_parse_cmdline(char *str) return -EINVAL; if (!strcmp(str, "off")) - srso_cmd = SRSO_CMD_OFF; + srso_mitigation = SRSO_MITIGATION_NONE; else if (!strcmp(str, "microcode")) - srso_cmd = SRSO_CMD_MICROCODE; + srso_mitigation = SRSO_MITIGATION_MICROCODE; else if (!strcmp(str, "safe-ret")) - srso_cmd = SRSO_CMD_SAFE_RET; + srso_mitigation = SRSO_MITIGATION_SAFE_RET; else if (!strcmp(str, "ibpb")) - srso_cmd = SRSO_CMD_IBPB; + srso_mitigation = SRSO_MITIGATION_IBPB; else if (!strcmp(str, "ibpb-vmexit")) - srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); @@ -2548,132 +2722,83 @@ early_param("spec_rstack_overflow", srso_parse_cmdline); static void __init srso_select_mitigation(void) { - bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); + bool has_microcode; - if (!boot_cpu_has_bug(X86_BUG_SRSO) || - cpu_mitigations_off() || - srso_cmd == SRSO_CMD_OFF) { - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; - goto out; - } + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + srso_mitigation = SRSO_MITIGATION_NONE; + + if (srso_mitigation == SRSO_MITIGATION_NONE) + return; + if (srso_mitigation == SRSO_MITIGATION_AUTO) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + + has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. - * - * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); - goto out; - } - - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - srso_mitigation = SRSO_MITIGATION_IBPB; - goto out; + srso_mitigation = SRSO_MITIGATION_NONE; + return; } } else { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); - - /* may be overwritten by SRSO_CMD_SAFE_RET below */ - srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } - switch (srso_cmd) { - case SRSO_CMD_MICROCODE: - if (has_microcode) { - srso_mitigation = SRSO_MITIGATION_MICROCODE; - pr_warn(SRSO_NOTICE); - } - break; - - case SRSO_CMD_SAFE_RET: - if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) { + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; goto ibpb_on_vmexit; + } - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - /* - * Enable the return thunk for generated code - * like ftrace, static_call, etc. 
- */ - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_UNRET); - - if (boot_cpu_data.x86 == 0x19) { - setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; - } else { - setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; - } - if (has_microcode) - srso_mitigation = SRSO_MITIGATION_SAFE_RET; - else - srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; - } else { + if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } - break; - case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB; - - /* - * IBPB on entry already obviates the need for - * software-based untraining so clear those in case some - * other mitigation like Retbleed has selected them. - */ - setup_clear_cpu_cap(X86_FEATURE_UNRET); - setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - - /* - * There is no need for RSB filling: write_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. - */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } - } else { - pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - } + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; break; - ibpb_on_vmexit: - case SRSO_CMD_IBPB_ON_VMEXIT: + case SRSO_MITIGATION_IBPB_ON_VMEXIT: if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) { pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n"); srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE; break; } - - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; - - /* - * There is no need for RSB filling: write_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. - */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } - } else { + fallthrough; + case SRSO_MITIGATION_IBPB: + if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } + + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; break; default: break; } +} -out: +static void __init srso_update_mitigation(void) +{ + /* If retbleed is using IBPB, that works for SRSO as well */ + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) + srso_mitigation = SRSO_MITIGATION_IBPB; + + if (boot_cpu_has_bug(X86_BUG_SRSO) && !cpu_mitigations_off()) + pr_info("%s\n", srso_strings[srso_mitigation]); +} + +static void __init srso_apply_mitigation(void) +{ /* * Clear the feature flag if this mitigation is not selected as that * feature flag controls the BpSpecReduce MSR bit toggling in KVM. 
@@ -2681,8 +2806,52 @@ out: if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE) setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE); - if (srso_mitigation != SRSO_MITIGATION_NONE) - pr_info("%s\n", srso_strings[srso_mitigation]); + if (srso_mitigation == SRSO_MITIGATION_NONE) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED: + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + x86_return_thunk = srso_alias_return_thunk; + } else { + setup_force_cpu_cap(X86_FEATURE_SRSO); + x86_return_thunk = srso_return_thunk; + } + break; + case SRSO_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + fallthrough; + case SRSO_MITIGATION_IBPB_ON_VMEXIT: + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; + default: + break; + } } #undef pr_fmt diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e5734df3b4a1..8a1f9a889eb4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -242,6 +242,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +SYM_PIC_ALIAS(gdt_page); #ifdef CONFIG_X86_64 static int __init x86_nopcid_setup(char *s) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 4a10d35e70aa..96cb992d50ef 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -1098,15 +1098,17 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz static int __init save_microcode_in_initrd(void) { - unsigned int cpuid_1_eax = native_cpuid_eax(1); struct cpuinfo_x86 *c = &boot_cpu_data; struct cont_desc desc = { 0 }; + unsigned int cpuid_1_eax; enum ucode_state ret; struct cpio_data cp; - if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) return 0; + cpuid_1_eax = native_cpuid_eax(1); + if (!find_blobs_in_containers(&cp)) return -EINVAL; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b3658d11e7b6..079f046ee26d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -41,8 +41,8 @@ #include "internal.h" -static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr = true; +static struct microcode_ops *microcode_ops; +static bool dis_ucode_ldr = false; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); module_param(force_minrev, bool, S_IRUSR | S_IWUSR); @@ -84,6 +84,9 @@ static bool amd_check_current_patch_level(void) u32 lvl, dummy, i; u32 *levels; + if (x86_cpuid_vendor() != X86_VENDOR_AMD) + return false; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, 
dummy); levels = final_levels; @@ -95,27 +98,29 @@ static bool amd_check_current_patch_level(void) return false; } -static bool __init check_loader_disabled_bsp(void) +bool __init microcode_loader_disabled(void) { - static const char *__dis_opt_str = "dis_ucode_ldr"; - const char *cmdline = boot_command_line; - const char *option = __dis_opt_str; + if (dis_ucode_ldr) + return true; /* - * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not - * completely accurate as xen pv guests don't see that CPUID bit set but - * that's good enough as they don't land on the BSP path anyway. + * Disable when: + * + * 1) The CPU does not support CPUID. + * + * 2) Bit 31 in CPUID[1]:ECX is clear + * The bit is reserved for hypervisor use. This is still not + * completely accurate as XEN PV guests don't see that CPUID bit + * set, but that's good enough as they don't land on the BSP + * path anyway. + * + * 3) Certain AMD patch levels are not allowed to be + * overwritten. */ - if (native_cpuid_ecx(1) & BIT(31)) - return true; - - if (x86_cpuid_vendor() == X86_VENDOR_AMD) { - if (amd_check_current_patch_level()) - return true; - } - - if (cmdline_find_option_bool(cmdline, option) <= 0) - dis_ucode_ldr = false; + if (!have_cpuid_p() || + native_cpuid_ecx(1) & BIT(31) || + amd_check_current_patch_level()) + dis_ucode_ldr = true; return dis_ucode_ldr; } @@ -125,7 +130,10 @@ void __init load_ucode_bsp(void) unsigned int cpuid_1_eax; bool intel = true; - if (!have_cpuid_p()) + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; + + if (microcode_loader_disabled()) return; cpuid_1_eax = native_cpuid_eax(1); @@ -146,9 +154,6 @@ void __init load_ucode_bsp(void) return; } - if (check_loader_disabled_bsp()) - return; - if (intel) load_ucode_intel_bsp(&early_data); else @@ -159,6 +164,11 @@ void load_ucode_ap(void) { unsigned int cpuid_1_eax; + /* + * Can't use microcode_loader_disabled() here - .init section + * hell. It doesn't have to either - the BSP variant must've + * parsed cmdline already anyway. 
+ */ if (dis_ucode_ldr) return; @@ -810,7 +820,7 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (dis_ucode_ldr) + if (microcode_loader_disabled()) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 819199bc0119..2a397da43923 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -389,7 +389,7 @@ static int __init save_builtin_microcode(void) if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED) return 0; - if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; uci.mc = get_microcode_blob(&uci, true); diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 5df621752fef..50a9702ae4e2 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -94,7 +94,6 @@ static inline unsigned int x86_cpuid_family(void) return x86_family(eax); } -extern bool dis_ucode_ldr; extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 3aad78bfcb26..cba75306e5b6 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/console.h> #include <linux/kernel.h> +#include <linux/kexec.h> #include <linux/init.h> #include <linux/string.h> #include <linux/screen_info.h> @@ -144,6 +145,11 @@ static __init void early_serial_hw_init(unsigned divisor) static_call(serial_out)(early_serial_base, DLL, divisor & 0xff); static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff); static_call(serial_out)(early_serial_base, LCR, c & ~DLAB); + +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64) + if (static_call_query(serial_in) == io_serial_in) + kexec_debug_8250_port = early_serial_base; +#endif } #define DEFAULT_BAUD 9600 @@ -327,6 +333,9 @@ static __init void early_pci_serial_init(char *s) /* WARNING! assuming the address is always in the first 4G */ early_serial_base = (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64) + kexec_debug_8250_mmio32 = bar0 & PCI_BASE_ADDRESS_MEM_MASK; +#endif write_pci_config(bus, slot, func, PCI_COMMAND, cmdreg|PCI_COMMAND_MEMORY); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cace6e8d7cc7..0853ba3fd04a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -55,10 +55,10 @@ void ftrace_arch_code_modify_post_process(void) { /* * ftrace_make_{call,nop}() may be called during - * module load, and we need to finish the text_poke_queue() + * module load, and we need to finish the smp_text_poke_batch_add() * that they do, here. 
*/ - text_poke_finish(); + smp_text_poke_batch_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); } @@ -119,7 +119,7 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, /* replace the text with the new text */ if (ftrace_poke_late) - text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); else text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; @@ -186,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ip = (unsigned long)(&ftrace_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); ip = (unsigned long)(&ftrace_regs_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } @@ -247,10 +247,10 @@ void ftrace_replace_code(int enable) break; } - text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); ftrace_update_record(rec, enable); } - text_poke_finish(); + smp_text_poke_batch_finish(); } void arch_ftrace_update_code(int command) @@ -492,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); mutex_unlock(&text_mutex); } @@ -586,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func) const char *new; new = ftrace_jmp_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index de001b2146ab..375f2d7f1762 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void) *ptr = (unsigned long)ptep + PAGE_OFFSET; #ifdef CONFIG_MICROCODE_INITRD32 - /* Running on a hypervisor? */ - if (native_cpuid_ecx(1) & BIT(31)) - return; - params = (struct boot_params *)__pa_nodebug(&boot_params); if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image) return; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fa9b6339975f..510fb41f55fc 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -47,7 +47,8 @@ * Manage page tables very early on. 
*/ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; -static unsigned int __initdata next_early_pgt; +unsigned int __initdata next_early_pgt; +SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL @@ -61,221 +62,15 @@ EXPORT_SYMBOL(ptrs_per_p4d); #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); +SYM_PIC_ALIAS(page_offset_base); unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; EXPORT_SYMBOL(vmalloc_base); +SYM_PIC_ALIAS(vmalloc_base); unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); +SYM_PIC_ALIAS(vmemmap_base); #endif -static inline bool check_la57_support(void) -{ - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) - return false; - - /* - * 5-level paging is detected and enabled at kernel decompression - * stage. Only check if it has been enabled there. - */ - if (!(native_read_cr4() & X86_CR4_LA57)) - return false; - - RIP_REL_REF(__pgtable_l5_enabled) = 1; - RIP_REL_REF(pgdir_shift) = 48; - RIP_REL_REF(ptrs_per_p4d) = 512; - RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5; - RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5; - RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5; - - return true; -} - -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, - pmdval_t *pmd, - unsigned long p2v_offset) -{ - unsigned long paddr, paddr_end; - int i; - - /* Encrypt the kernel and related (if SME is active) */ - sme_encrypt_kernel(bp); - - /* - * Clear the memory encryption mask from the .bss..decrypted section. - * The bss section will be memset to zero later in the initialization so - * there is no need to zero it after changing the memory encryption - * attribute. - */ - if (sme_get_me_mask()) { - paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted); - paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted); - - for (; paddr < paddr_end; paddr += PMD_SIZE) { - /* - * On SNP, transition the page to shared in the RMP table so that - * it is consistent with the page table attribute change. - * - * __start_bss_decrypted has a virtual address in the high range - * mapping (kernel .text). PVALIDATE, by way of - * early_snp_set_memory_shared(), requires a valid virtual - * address but the kernel is currently running off of the identity - * mapping so use the PA to get a *currently* valid virtual address. - */ - early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); - - i = pmd_index(paddr - p2v_offset); - pmd[i] -= sme_get_me_mask(); - } - } - - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); -} - -/* Code in __startup_64() can be relocated during execution, but the compiler - * doesn't have to generate PC-relative relocations when accessing globals from - * that function. Clang actually does not generate them, which leads to - * boot-time crashes. To work around this problem, every global pointer must - * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined - * by subtracting p2v_offset from the RIP-relative address. 
- */ -unsigned long __head __startup_64(unsigned long p2v_offset, - struct boot_params *bp) -{ - pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); - unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text); - unsigned long va_text, va_end; - unsigned long pgtable_flags; - unsigned long load_delta; - pgdval_t *pgd; - p4dval_t *p4d; - pudval_t *pud; - pmdval_t *pmd, pmd_entry; - bool la57; - int i; - - la57 = check_la57_support(); - - /* Is the address too large? */ - if (physaddr >> MAX_PHYSMEM_BITS) - for (;;); - - /* - * Compute the delta between the address I am compiled to run at - * and the address I am actually running at. - */ - load_delta = __START_KERNEL_map + p2v_offset; - RIP_REL_REF(phys_base) = load_delta; - - /* Is the address not 2M aligned? */ - if (load_delta & ~PMD_MASK) - for (;;); - - va_text = physaddr - p2v_offset; - va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset; - - /* Include the SME encryption mask in the fixup value */ - load_delta += sme_get_me_mask(); - - /* Fixup the physical addresses in the page table */ - - pgd = &RIP_REL_REF(early_top_pgt)->pgd; - pgd[pgd_index(__START_KERNEL_map)] += load_delta; - - if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) { - p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); - p4d[MAX_PTRS_PER_P4D - 1] += load_delta; - - pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; - } - - RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; - RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta; - - for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) - RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta; - - /* - * Set up the identity mapping for the switchover. These - * entries should *NOT* have the global bit set! This also - * creates a bunch of nonsense entries but that is fine -- - * it avoids problems around wraparound. - */ - - pud = &early_pgts[0]->pmd; - pmd = &early_pgts[1]->pmd; - RIP_REL_REF(next_early_pgt) = 2; - - pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); - - if (la57) { - p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd; - - i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; - pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; - - i = physaddr >> P4D_SHIFT; - p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; - p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; - } else { - i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + pgtable_flags; - pgd[i + 1] = (pgdval_t)pud + pgtable_flags; - } - - i = physaddr >> PUD_SHIFT; - pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; - pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; - - pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; - /* Filter out unsupported __PAGE_KERNEL_* bits: */ - pmd_entry &= RIP_REL_REF(__supported_pte_mask); - pmd_entry += sme_get_me_mask(); - pmd_entry += physaddr; - - for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { - int idx = i + (physaddr >> PMD_SHIFT); - - pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; - } - - /* - * Fixup the kernel text+data virtual addresses. Note that - * we might write invalid pmds, when the kernel is relocated - * cleanup_highmap() fixes this up along with the mappings - * beyond _end. - * - * Only the region occupied by the kernel image has so far - * been checked against the table of usable memory regions - * provided by the firmware, so invalidate pages outside that - * region. 
A page table entry that maps to a reserved area of - * memory would allow processor speculation into that area, - * and on some hardware (particularly the UV platform) even - * speculative access to some reserved areas is caught as an - * error, causing the BIOS to halt the system. - */ - - pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; - - /* invalidate pages before the kernel image */ - for (i = 0; i < pmd_index(va_text); i++) - pmd[i] &= ~_PAGE_PRESENT; - - /* fixup pages that are part of the kernel image */ - for (; i <= pmd_index(va_end); i++) - if (pmd[i] & _PAGE_PRESENT) - pmd[i] += load_delta; - - /* invalidate pages after the kernel image */ - for (; i < PTRS_PER_PMD; i++) - pmd[i] &= ~_PAGE_PRESENT; - - return sme_postprocess_startup(bp, pmd, p2v_offset); -} - /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { @@ -513,41 +308,6 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data) start_kernel(); } -/* - * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is - * used until the idt_table takes over. On the boot CPU this happens in - * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases - * this happens in the functions called from head_64.S. - * - * The idt_table can't be used that early because all the code modifying it is - * in idt.c and can be instrumented by tracing or KASAN, which both don't work - * during early CPU bringup. Also the idt_table has the runtime vectors - * configured which require certain CPU state to be setup already (like TSS), - * which also hasn't happened yet in early CPU bringup. - */ -static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; - -/* This may run while still in the direct mapping */ -static void __head startup_64_load_idt(void *vc_handler) -{ - struct desc_ptr desc = { - .address = (unsigned long)&RIP_REL_REF(bringup_idt_table), - .size = sizeof(bringup_idt_table) - 1, - }; - struct idt_data data; - gate_desc idt_desc; - - /* @vc_handler is set only for a VMM Communication Exception */ - if (vc_handler) { - init_idt_data(&data, X86_TRAP_VC, vc_handler); - idt_init_desc(&idt_desc, &data); - native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); - } - - native_load_idt(&desc); -} - -/* This is used when running on kernel addresses */ void early_setup_idt(void) { void *handler = NULL; @@ -559,30 +319,3 @@ void early_setup_idt(void) startup_64_load_idt(handler); } - -/* - * Setup boot CPU state needed before kernel switches to virtual addresses. 
- */ -void __head startup_64_setup_gdt_idt(void) -{ - struct desc_struct *gdt = (void *)(__force unsigned long)gdt_page.gdt; - void *handler = NULL; - - struct desc_ptr startup_gdt_descr = { - .address = (unsigned long)&RIP_REL_REF(*gdt), - .size = GDT_SIZE - 1, - }; - - /* Load GDT */ - native_load_gdt(&startup_gdt_descr); - - /* New GDT is live - reload data segment registers */ - asm volatile("movl %%eax, %%ds\n" - "movl %%eax, %%ss\n" - "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) - handler = &RIP_REL_REF(vc_no_ghcb); - - startup_64_load_idt(handler); -} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 2e42056d2306..76743dfad6ab 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -86,7 +86,7 @@ SYM_CODE_START(startup_32) movl $pa(__bss_stop),%ecx subl %edi,%ecx shrl $2,%ecx - rep ; stosl + rep stosl /* * Copy bootup parameters out of the way. * Note: %esi still has the pointer to the real-mode data. @@ -98,15 +98,13 @@ SYM_CODE_START(startup_32) movl $pa(boot_params),%edi movl $(PARAM_SIZE/4),%ecx cld - rep - movsl + rep movsl movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi jz 1f # No command line movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx - rep - movsl + rep movsl 1: #ifdef CONFIG_OLPC diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fefe2a25cf02..069420853304 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -573,6 +573,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Pure iret required here - don't use INTERRUPT_RETURN */ iretq SYM_CODE_END(vc_no_ghcb) +SYM_PIC_ALIAS(vc_no_ghcb); #endif #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION @@ -604,10 +605,12 @@ SYM_DATA_START_PTI_ALIGNED(early_top_pgt) .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 SYM_DATA_END(early_top_pgt) +SYM_PIC_ALIAS(early_top_pgt) SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 SYM_DATA_END(early_dynamic_pgts) +SYM_PIC_ALIAS(early_dynamic_pgts); SYM_DATA(early_recursion_flag, .long 0) @@ -651,6 +654,7 @@ SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level4_kernel_pgt) +SYM_PIC_ALIAS(level4_kernel_pgt) #endif SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) @@ -659,6 +663,7 @@ SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level3_kernel_pgt) +SYM_PIC_ALIAS(level3_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* @@ -676,6 +681,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) SYM_DATA_END(level2_kernel_pgt) +SYM_PIC_ALIAS(level2_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 @@ -688,6 +694,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) /* 6 MB reserved space + a 2MB hole */ .fill 4,8,0 SYM_DATA_END(level2_fixmap_pgt) +SYM_PIC_ALIAS(level2_fixmap_pgt) SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .rept (FIXMAP_PMD_NUM) @@ -703,6 +710,7 @@ SYM_DATA(smpboot_control, .long 0) .align 16 /* This must match the first entry in level2_kernel_pgt */ SYM_DATA(phys_base, .quad 0x0) +SYM_PIC_ALIAS(phys_base); EXPORT_SYMBOL(phys_base) #include "../xen/xen-head.S" diff --git a/arch/x86/kernel/jump_label.c 
b/arch/x86/kernel/jump_label.c index f5b8ef02d172..a7949a54a0ff 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, @@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); - text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } @@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); - text_poke_finish(); + smp_text_poke_batch_finish(); mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 09608fd93687..47cb8eb138ba 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -808,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p) u8 int3 = INT3_INSN_OPCODE; text_poke(p->addr, &int3, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } @@ -818,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p) perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } void arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 36d6809c6c9e..0aabd4c4e2c4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); + smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } @@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op) JMP32_INSN_SIZE - INT3_INSN_SIZE); text_poke(addr, new, INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); text_poke(addr + INT3_INSN_SIZE, new + INT3_INSN_SIZE, JMP32_INSN_SIZE - INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a68f5a0a9f37..949c9e4bfad2 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -76,6 +76,19 @@ map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } #endif +static int map_mmio_serial(struct x86_mapping_info *info, pgd_t *level4p) +{ + unsigned long mstart, mend; + + if (!kexec_debug_8250_mmio32) + return 0; + + mstart = kexec_debug_8250_mmio32 & PAGE_MASK; + mend = (kexec_debug_8250_mmio32 + PAGE_SIZE + 23) & PAGE_MASK; + pr_info("Map PCI serial at %lx - %lx\n", mstart, mend); + return kernel_ident_mapping_init(info, level4p, mstart, mend); +} + #ifdef CONFIG_KEXEC_FILE const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, @@ -285,6 +298,10 @@ static int 
init_pgtable(struct kimage *image, unsigned long control_page) if (result) return result; + result = map_mmio_serial(&info, image->arch.pgd); + if (result) + return result; + /* * This must be last because the intermediate page table pages it * allocates will not be control pages and may overlap the image. @@ -304,6 +321,24 @@ static void load_segments(void) ); } +static void prepare_debug_idt(unsigned long control_page, unsigned long vec_ofs) +{ + gate_desc idtentry = { 0 }; + int i; + + idtentry.bits.p = 1; + idtentry.bits.type = GATE_TRAP; + idtentry.segment = __KERNEL_CS; + idtentry.offset_low = (control_page & 0xFFFF) + vec_ofs; + idtentry.offset_middle = (control_page >> 16) & 0xFFFF; + idtentry.offset_high = control_page >> 32; + + for (i = 0; i < 16; i++) { + kexec_debug_idt[i] = idtentry; + idtentry.offset_low += KEXEC_DEBUG_EXC_HANDLER_SIZE; + } +} + int machine_kexec_prepare(struct kimage *image) { void *control_page = page_address(image->control_code_page); @@ -321,6 +356,9 @@ int machine_kexec_prepare(struct kimage *image) if (image->type == KEXEC_TYPE_DEFAULT) kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT; + prepare_debug_idt((unsigned long)__pa(control_page), + (unsigned long)kexec_debug_exc_vectors - reloc_start); + __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); set_memory_rox((unsigned long)control_page, 1); @@ -396,16 +434,10 @@ void __nocfi machine_kexec(struct kimage *image) * with from a table in memory. At no other time is the * descriptor table in memory accessed. * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. + * Take advantage of this here by force loading the segments, + * before the GDT is zapped with an invalid value. */ load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. 
- */ - native_idt_invalidate(); - native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index a7998f351701..231d6326d1fd 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -206,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs, write, apply); if (!early) { - text_poke_sync(); + smp_text_poke_sync_each_cpu(); mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c7c4b1917336..57276f134d12 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -263,17 +263,17 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movl %edx, %edi movl $1024, %ecx - rep ; movsl + rep movsl movl %ebp, %edi movl %eax, %esi movl $1024, %ecx - rep ; movsl + rep movsl movl %eax, %edi movl %edx, %esi movl $1024, %ecx - rep ; movsl + rep movsl lea PAGE_SIZE(%ebp), %esi jmp 0b diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ac058971a382..ea604f4d0b52 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -39,6 +39,8 @@ SYM_DATA(kexec_va_control_page, .quad 0) SYM_DATA(kexec_pa_table_page, .quad 0) SYM_DATA(kexec_pa_swap_page, .quad 0) SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) +SYM_DATA(kexec_debug_8250_mmio32, .quad 0) +SYM_DATA(kexec_debug_8250_port, .word 0) .balign 16 SYM_DATA_START_LOCAL(kexec_debug_gdt) @@ -50,6 +52,11 @@ SYM_DATA_START_LOCAL(kexec_debug_gdt) .quad 0x00cf92000000ffff /* __KERNEL_DS */ SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end) + .balign 8 +SYM_DATA_START(kexec_debug_idt) + .skip 0x100, 0x00 +SYM_DATA_END(kexec_debug_idt) + .section .text..relocate_kernel,"ax"; .code64 SYM_CODE_START_NOALIGN(relocate_kernel) @@ -72,8 +79,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel) pushq %r15 pushf - /* zero out flags, and disable interrupts */ - pushq $0 + /* Invalidate GDT/IDT, zero out flags */ + pushq $0 + pushq $0 + + lidt (%rsp) + lgdt (%rsp) + addq $8, %rsp popfq /* Switch to the identity mapped page tables */ @@ -139,6 +151,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %ds, %rax movq %rax, %ds + /* Now an IDTR on the stack to load the IDT the kernel created */ + leaq kexec_debug_idt(%rip), %rsi + pushq %rsi + pushw $0xff + lidt (%rsp) + addq $10, %rsp + + //int3 + /* * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP * below. @@ -342,20 +363,20 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) /* copy source page to swap page */ movq kexec_pa_swap_page(%rip), %rdi movl $512, %ecx - rep ; movsq + rep movsq /* copy destination page to source page */ movq %rax, %rdi movq %rdx, %rsi movl $512, %ecx - rep ; movsq + rep movsq /* copy swap page to destination page */ movq %rdx, %rdi movq kexec_pa_swap_page(%rip), %rsi .Lnoswap: movl $512, %ecx - rep ; movsq + rep movsq lea PAGE_SIZE(%rax), %rsi jmp .Lloop @@ -364,3 +385,222 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) ret int3 SYM_CODE_END(swap_pages) + +/* + * Generic 'print character' routine + * - %al: Character to be printed (may clobber %rax) + * - %rdx: MMIO address or port. 
+ */ +#define XMTRDY 0x20 + +#define TXR 0 /* Transmit register (WRITE) */ +#define LSR 5 /* Line Status */ + +SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + addw $LSR, %dx + xchg %al, %ah +.Lxmtrdy_loop: + inb %dx, %al + testb $XMTRDY, %al + jnz .Lready + pause + jmp .Lxmtrdy_loop + +.Lready: + subw $LSR, %dx + xchg %al, %ah + outb %al, %dx +pr_char_null: + ANNOTATE_NOENDBR + + ANNOTATE_UNRET_SAFE + ret +SYM_CODE_END(pr_char_8250) + +SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR +.Lxmtrdy_loop_mmio: + movb (LSR*4)(%rdx), %ah + testb $XMTRDY, %ah + jnz .Lready_mmio + pause + jmp .Lxmtrdy_loop_mmio + +.Lready_mmio: + movb %al, (%rdx) + ANNOTATE_UNRET_SAFE + ret +SYM_CODE_END(pr_char_8250_mmio32) + +/* + * Load pr_char function pointer into %rsi and load %rdx with whatever + * that function wants to see there (typically port/MMIO address). + */ +.macro pr_setup + leaq pr_char_8250(%rip), %rsi + movw kexec_debug_8250_port(%rip), %dx + testw %dx, %dx + jnz 1f + + leaq pr_char_8250_mmio32(%rip), %rsi + movq kexec_debug_8250_mmio32(%rip), %rdx + testq %rdx, %rdx + jnz 1f + + leaq pr_char_null(%rip), %rsi +1: +.endm + +/* Print the nybble in %bl, clobber %rax */ +SYM_CODE_START_LOCAL_NOALIGN(pr_nybble) + UNWIND_HINT_FUNC + movb %bl, %al + nop + andb $0x0f, %al + addb $0x30, %al + cmpb $0x3a, %al + jb 1f + addb $('a' - '0' - 10), %al + ANNOTATE_RETPOLINE_SAFE +1: jmp *%rsi +SYM_CODE_END(pr_nybble) + +SYM_CODE_START_LOCAL_NOALIGN(pr_qword) + UNWIND_HINT_FUNC + movq $16, %rcx +1: rolq $4, %rbx + call pr_nybble + loop 1b + movb $'\n', %al + ANNOTATE_RETPOLINE_SAFE + jmp *%rsi +SYM_CODE_END(pr_qword) + +.macro print_reg a, b, c, d, r + movb $\a, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\b, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\c, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\d, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movq \r, %rbx + call pr_qword +.endm + +SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors) + /* Each of these is 6 bytes. */ +.macro vec_err exc + UNWIND_HINT_ENTRY + . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE) + nop + nop + pushq $\exc + jmp exc_handler +.endm + +.macro vec_noerr exc + UNWIND_HINT_ENTRY + . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE) + pushq $0 + pushq $\exc + jmp exc_handler +.endm + + ANNOTATE_NOENDBR + vec_noerr 0 // #DE + vec_noerr 1 // #DB + vec_noerr 2 // #NMI + vec_noerr 3 // #BP + vec_noerr 4 // #OF + vec_noerr 5 // #BR + vec_noerr 6 // #UD + vec_noerr 7 // #NM + vec_err 8 // #DF + vec_noerr 9 + vec_err 10 // #TS + vec_err 11 // #NP + vec_err 12 // #SS + vec_err 13 // #GP + vec_err 14 // #PF + vec_noerr 15 +SYM_CODE_END(kexec_debug_exc_vectors) + +SYM_CODE_START_LOCAL_NOALIGN(exc_handler) + /* No need for RET mitigations during kexec */ + VALIDATE_UNRET_END + + pushq %rax + pushq %rbx + pushq %rcx + pushq %rdx + pushq %rsi + + /* Stack frame */ +#define EXC_SS 0x58 /* Architectural... 
*/ +#define EXC_RSP 0x50 +#define EXC_EFLAGS 0x48 +#define EXC_CS 0x40 +#define EXC_RIP 0x38 +#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */ +#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */ +#define EXC_RAX 0x20 /* Pushed just above in exc_handler */ +#define EXC_RBX 0x18 +#define EXC_RCX 0x10 +#define EXC_RDX 0x08 +#define EXC_RSI 0x00 + + /* Set up %rdx/%rsi for debug output */ + pr_setup + + /* rip and exception info */ + print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp) + print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp) + print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp) + print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp) + + /* We spilled these to the stack */ + print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp) + print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp) + print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp) + print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp) + print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp) + + /* Other registers untouched */ + print_reg 'r', 'd', 'i', ':', %rdi + print_reg 'r', '8', ' ', ':', %r8 + print_reg 'r', '9', ' ', ':', %r9 + print_reg 'r', '1', '0', ':', %r10 + print_reg 'r', '1', '1', ':', %r11 + print_reg 'r', '1', '2', ':', %r12 + print_reg 'r', '1', '3', ':', %r13 + print_reg 'r', '1', '4', ':', %r14 + print_reg 'r', '1', '5', ':', %r15 + print_reg 'c', 'r', '2', ':', %cr2 + + /* Only return from INT3 */ + cmpq $3, EXC_EXCEPTION(%rsp) + jne .Ldie + + popq %rsi + popq %rdx + popq %rcx + popq %rbx + popq %rax + + addq $16, %rsp + iretq + +.Ldie: + hlt + jmp .Ldie + +SYM_CODE_END(exc_handler) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d2a13b37833..e0cf1595a0ab 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -134,6 +134,7 @@ struct ist_info ist_info; struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +SYM_PIC_ALIAS(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) __visible unsigned long mmu_cr4_features __ro_after_init; diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index a59c72e77645..8164a7323c17 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, emulate); + smp_text_poke_single(insn, code, size, emulate); } static void __static_call_validate(u8 *insn, bool tail, bool tramp) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9f88b8a78e50..d67407c623f3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -882,16 +882,16 @@ static void do_int3_user(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_int3) { /* - * poke_int3_handler() is completely self contained code; it does (and + * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ - if (poke_int3_handler(regs)) + if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() - * and therefore can trigger INT3, hence poke_int3_handler() must + * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. 
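
The register dump in exc_handler above is built from pr_char_8250/pr_char_8250_mmio32 plus the pr_nybble/pr_qword helpers. A rough userspace C analogue, with the port and MMIO accessors replaced by a stub that always reports the transmitter ready and "sends" bytes to stdout; XMTRDY and the LSR/TXR names follow the defines above:

#include <stdint.h>
#include <stdio.h>

#define XMTRDY 0x20	/* LSR bit: transmit holding register empty */

/* Stand-ins for the inb/outb (or MMIO) accessors used by the real routines. */
static uint8_t fake_read_lsr(void) { return XMTRDY; }
static void fake_write_txr(uint8_t c) { putchar(c); }

/* Analogue of pr_char_8250: poll LSR until XMTRDY, then write the byte to TXR. */
static void pr_char(uint8_t c)
{
	while (!(fake_read_lsr() & XMTRDY))
		;	/* the asm version executes PAUSE in this loop */
	fake_write_txr(c);
}

/* Analogue of pr_qword: emit 16 nybbles, most significant first, then a newline. */
static void pr_qword(uint64_t val)
{
	for (int i = 0; i < 16; i++) {
		/* Rotate left by 4 so the top nybble lands in the low bits, as pr_qword does with rolq. */
		val = (val << 4) | (val >> 60);
		uint8_t nyb = val & 0x0f;

		/* Same '0'/'a' adjustment as pr_nybble. */
		pr_char(nyb < 10 ? '0' + nyb : 'a' + nyb - 10);
	}
	pr_char('\n');
}

int main(void)
{
	pr_qword(0xdeadbeef12345678ull);	/* prints "deadbeef12345678" */
	return 0;
}

pr_setup picks the port variant when kexec_debug_8250_port is non-zero, otherwise the MMIO variant when kexec_debug_8250_mmio32 is set, otherwise the null sink; the sketch hard-codes a single fake transport.
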
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ccdc45e5b759..d813f64a89d6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -79,11 +79,13 @@ const_cpu_current_top_of_stack = cpu_current_top_of_stack; #define BSS_DECRYPTED \ . = ALIGN(PMD_SIZE); \ __start_bss_decrypted = .; \ + __pi___start_bss_decrypted = .; \ *(.bss..decrypted); \ . = ALIGN(PAGE_SIZE); \ __start_bss_decrypted_unused = .; \ . = ALIGN(PMD_SIZE); \ __end_bss_decrypted = .; \ + __pi___end_bss_decrypted = .; \ #else @@ -128,6 +130,7 @@ SECTIONS /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + __pi__text = .; _stext = .; ALIGN_ENTRY_TEXT_BEGIN *(.text..__x86.rethunk_untrain) @@ -391,6 +394,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; + __pi__end = .; #ifdef CONFIG_AMD_MEM_ENCRYPT /* @@ -466,10 +470,18 @@ SECTIONS } /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause + * this. Let's assume that nobody will be running a COMPILE_TEST kernel and + * let's assert that fuller build coverage is more valuable than being able to + * run a COMPILE_TEST kernel. + */ +#ifndef CONFIG_COMPILE_TEST +/* + * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility: */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +#endif /* needed for Clang - see arch/x86/entry/entry.S */ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 050a0e229a4d..f2b36d32ef40 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); + /* * Checking root.hpa is sufficient even when KVM has mirror root. * We can have either: diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 63bb77ee1bb1..8d1b632e33d2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } +EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_memory_slot *slot = range->slot; + int level; + /* * Zap SPTEs even if the slot can't be mapped PRIVATE. 
KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM @@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; + if (WARN_ON_ONCE(range->end <= range->start)) + return false; + + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); + + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); + } + /* Unmap the old attribute page. */ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) range->attr_filter = KVM_FILTER_SHARED; @@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, return kvm_unmap_gfn_range(kvm, range); } -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; -} - -static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; -} -static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; -} static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 699e551ec93b..9864c057187d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } +EXPORT_SYMBOL_GPL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0bc708ee2788..a7a7dc507336 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3173,9 +3173,14 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -3184,18 +3189,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", 
svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -3266,11 +3277,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d5d0c5c3300b..a89c271a1951 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); - - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); - return 0; } @@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. 
+ */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -2231,6 +2287,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -5036,6 +5096,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -5061,6 +5123,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d4490eaed55d..f16b068c4228 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -335,6 +335,8 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; + bool bp_spec_reduce_set; + struct vmcb *save_area; unsigned long save_area_pa; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5c5766467a61..0aba4719ae0a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -273,6 +273,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) case L1TF_MITIGATION_OFF: l1tf = VMENTER_L1D_FLUSH_NEVER; break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: @@ -7358,10 +7359,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. + * + * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, + * and is affected by MMIO Stale Data. In such cases mitigation in only + * needed against an MMIO capable guest. 
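
Back in svm.c, svm_srso_vm_init()/svm_srso_vm_destroy() above keep BP_SPEC_REDUCE set only while at least one VM exists, using a VM count plus a lock that serializes the 0 -> 1 and 1 -> 0 transitions. A userspace model of that counting pattern, with a pthread mutex standing in for srso_lock, C11 atomics for srso_nr_vms, and a printf standing in for the on_each_cpu() IPI (the kernel sets the actual MSR bit lazily per CPU in svm_prepare_switch_to_guest()):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t srso_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int srso_nr_vms;

/* Stand-in for on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1). */
static void clear_bp_spec_reduce_everywhere(void)
{
	printf("clearing BP_SPEC_REDUCE on all CPUs\n");
}

static void srso_vm_init(void)
{
	/*
	 * Fast path: if another VM already exists, only bump the count
	 * (atomic_inc_not_zero() in the kernel, modeled as a CAS loop).
	 */
	int old = atomic_load(&srso_nr_vms);

	while (old != 0)
		if (atomic_compare_exchange_weak(&srso_nr_vms, &old, old + 1))
			return;

	/*
	 * 0 -> 1: take the lock so a concurrent "last VM" teardown either
	 * finishes its clearing IPI first or sees the new count and skips it.
	 */
	pthread_mutex_lock(&srso_lock);
	atomic_fetch_add(&srso_nr_vms, 1);
	pthread_mutex_unlock(&srso_lock);
}

static void srso_vm_destroy(void)
{
	/* Not the last VM: nothing to do. */
	if (atomic_fetch_sub(&srso_nr_vms, 1) != 1)
		return;

	pthread_mutex_lock(&srso_lock);
	/* A new VM may have raced in; if so, leave the mitigation in place. */
	if (atomic_load(&srso_nr_vms) == 0)
		clear_bp_spec_reduce_everywhere();
	pthread_mutex_unlock(&srso_lock);
}

int main(void)
{
	srso_vm_init();
	srso_vm_destroy();
	return 0;
}

Taking the lock on the 0 -> 1 path is what guarantees a racing teardown has finished (or observes the new count and backs off) before the new VM's vCPUs ever run.
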
*/ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mmio_stale_data_clear) && + else if (static_branch_unlikely(&cpu_buf_vm_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); @@ -7700,6 +7705,7 @@ int vmx_vm_init(struct kvm *kvm) case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index df5b99ea1f18..9896fd574bfc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4597,7 +4597,7 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } -static inline u32 kvm_sync_valid_fields(struct kvm *kvm) +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) { return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; } @@ -11493,7 +11493,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; - u32 sync_valid_fields; + u64 sync_valid_fields; int r; r = kvm_mmu_post_init_vm(vcpu->kvm); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e86eda2c0b04..eb2d2e1cbddd 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -75,7 +75,7 @@ static void delay_tsc(u64 cycles) /* Allow RT tasks to run */ preempt_enable(); - rep_nop(); + native_pause(); preempt_disable(); /* diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 98631c0e7a11..f786401ac15d 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -631,14 +631,21 @@ static bool get_desc(struct desc_struct *out, unsigned short sel) /* Bits [15:3] contain the index of the desired entry. */ sel >>= 3; - mutex_lock(¤t->active_mm->context.lock); - ldt = current->active_mm->context.ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return false; + + mutex_lock(¤t->mm->context.lock); + ldt = current->mm->context.ldt; if (ldt && sel < ldt->nr_entries) { *out = ldt->entries[sel]; success = true; } - mutex_unlock(¤t->active_mm->context.lock); + mutex_unlock(¤t->mm->context.lock); return success; } diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 6ffb931b9fb1..149a57e334ab 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -324,6 +324,11 @@ int insn_get_opcode(struct insn *insn) } insn->attr = inat_get_opcode_attribute(op); + if (insn->x86_64 && inat_is_invalid64(insn->attr)) { + /* This instruction is invalid, like UD2. Stop decoding. 
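
get_desc() above turns a selector into a descriptor table index with sel >>= 3 ("Bits [15:3] contain the index of the desired entry"). For reference, the full selector layout, unpacked in a small illustrative C sketch (the struct and field names here are just for the example):

#include <stdint.h>
#include <stdio.h>

/* x86 segment selector: RPL in bits 1:0, table indicator in bit 2, index in bits 15:3. */
struct selector {
	uint16_t index;	/* descriptor table index, as used by get_desc() */
	uint8_t ti;	/* 0 = GDT, 1 = LDT */
	uint8_t rpl;	/* requested privilege level */
};

static struct selector decode_selector(uint16_t sel)
{
	struct selector s = {
		.index = sel >> 3,
		.ti = (sel >> 2) & 1,
		.rpl = sel & 3,
	};
	return s;
}

int main(void)
{
	struct selector s = decode_selector(0x2b);	/* GDT entry 5, RPL 3 */

	printf("index=%u ti=%u rpl=%u\n", s.index, s.ti, s.rpl);	/* index=5 ti=0 rpl=3 */
	return 0;
}
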
*/ + insn->attr &= INAT_INV64; + } + while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ op = get_next(insn_byte_t, insn); @@ -337,6 +342,7 @@ int insn_get_opcode(struct insn *insn) insn->attr = 0; return -EINVAL; } + end: opcode->got = 1; return 0; @@ -658,7 +664,6 @@ int insn_get_immediate(struct insn *insn) } if (!inat_has_immediate(insn->attr)) - /* no immediates */ goto done; switch (inat_immediate_size(insn->attr)) { diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c index 5eecb45d05d5..c20e04764edc 100644 --- a/arch/x86/lib/iomem.c +++ b/arch/x86/lib/iomem.c @@ -10,7 +10,7 @@ static __always_inline void rep_movs(void *to, const void *from, size_t n) { unsigned long d0, d1, d2; - asm volatile("rep ; movsl\n\t" + asm volatile("rep movsl\n\t" "testb $2,%b4\n\t" "je 1f\n\t" "movsw\n" diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 0ae2e1712e2e..12a23fa7c44c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -41,6 +41,7 @@ SYM_FUNC_END(__memcpy) EXPORT_SYMBOL(__memcpy) SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy) +SYM_PIC_ALIAS(memcpy) EXPORT_SYMBOL(memcpy) SYM_FUNC_START_LOCAL(memcpy_orig) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index d66b710d628f..fb5a03cf5ab7 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -42,6 +42,7 @@ SYM_FUNC_END(__memset) EXPORT_SYMBOL(__memset) SYM_FUNC_ALIAS_MEMFUNC(memset, __memset) +SYM_PIC_ALIAS(memset) EXPORT_SYMBOL(memset) SYM_FUNC_START_LOCAL(memset_orig) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index a26c43abd47d..9f3116609c8c 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -40,6 +40,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) ALTERNATIVE_2 __stringify(RETPOLINE \reg), \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE, \ __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), ALT_NOT(X86_FEATURE_RETPOLINE) +SYM_PIC_ALIAS(__x86_indirect_thunk_\reg) .endm @@ -394,6 +395,7 @@ SYM_CODE_START(__x86_return_thunk) #endif int3 SYM_CODE_END(__x86_return_thunk) +SYM_PIC_ALIAS(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_MITIGATION_RETHUNK */ diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 53b3f202267c..f87ec24fa579 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -40,8 +40,7 @@ char *strncpy(char *dest, const char *src, size_t count) "stosb\n\t" "testb %%al,%%al\n\t" "jne 1b\n\t" - "rep\n\t" - "stosb\n" + "rep stosb\n" "2:" : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) : "0" (src), "1" (dest), "2" (count) : "memory"); @@ -54,8 +53,7 @@ EXPORT_SYMBOL(strncpy); char *strcat(char *dest, const char *src) { int d0, d1, d2, d3; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "decl %1\n" "1:\tlodsb\n\t" "stosb\n\t" @@ -72,8 +70,7 @@ EXPORT_SYMBOL(strcat); char *strncat(char *dest, const char *src, size_t count) { int d0, d1, d2, d3; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "decl %1\n\t" "movl %8,%3\n" "1:\tdecl %3\n\t" @@ -167,8 +164,7 @@ size_t strlen(const char *s) { int d0; size_t res; - asm volatile("repne\n\t" - "scasb" + asm volatile("repne scasb" : "=c" (res), "=&D" (d0) : "1" (s), "a" (0), "0" (0xffffffffu) : "memory"); @@ -184,8 +180,7 @@ void *memchr(const void *cs, int c, size_t count) void *res; if (!count) return NULL; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "je 1f\n\t" "movl $1,%0\n" 
"1:\tdecl %0" @@ -202,7 +197,7 @@ void *memscan(void *addr, int c, size_t size) { if (!size) return addr; - asm volatile("repnz; scasb\n\t" + asm volatile("repnz scasb\n\t" "jnz 1f\n\t" "dec %%edi\n" "1:" diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c index 38f37df056f7..28267985e85f 100644 --- a/arch/x86/lib/strstr_32.c +++ b/arch/x86/lib/strstr_32.c @@ -8,16 +8,14 @@ int d0, d1; register char *__res; __asm__ __volatile__( "movl %6,%%edi\n\t" - "repne\n\t" - "scasb\n\t" + "repne scasb\n\t" "notl %%ecx\n\t" "decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */ "movl %%ecx,%%edx\n" "1:\tmovl %6,%%edi\n\t" "movl %%esi,%%eax\n\t" "movl %%edx,%%ecx\n\t" - "repe\n\t" - "cmpsb\n\t" + "repe cmpsb\n\t" "je 2f\n\t" /* also works for empty string, see above */ "xchgl %%eax,%%esi\n\t" "incl %%esi\n\t" diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 422257c350c6..f6f436f1d573 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -38,9 +38,9 @@ do { \ might_fault(); \ __asm__ __volatile__( \ ASM_STAC "\n" \ - "0: rep; stosl\n" \ + "0: rep stosl\n" \ " movl %2,%0\n" \ - "1: rep; stosb\n" \ + "1: rep stosb\n" \ "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \ _ASM_EXTABLE_UA(1b, 2b) \ @@ -140,9 +140,9 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) " shrl $2, %0\n" " andl $3, %%eax\n" " cld\n" - "99: rep; movsl\n" + "99: rep movsl\n" "36: movl %%eax, %0\n" - "37: rep; movsb\n" + "37: rep movsb\n" "100:\n" _ASM_EXTABLE_UA(1b, 100b) _ASM_EXTABLE_UA(2b, 100b) @@ -242,9 +242,9 @@ static unsigned long __copy_user_intel_nocache(void *to, " shrl $2, %0\n" " andl $3, %%eax\n" " cld\n" - "6: rep; movsl\n" + "6: rep movsl\n" " movl %%eax,%0\n" - "7: rep; movsb\n" + "7: rep movsb\n" "8:\n" _ASM_EXTABLE_UA(0b, 8b) _ASM_EXTABLE_UA(1b, 8b) @@ -293,14 +293,14 @@ do { \ " negl %0\n" \ " andl $7,%0\n" \ " subl %0,%3\n" \ - "4: rep; movsb\n" \ + "4: rep movsb\n" \ " movl %3,%0\n" \ " shrl $2,%0\n" \ " andl $3,%3\n" \ " .align 2,0x90\n" \ - "0: rep; movsl\n" \ + "0: rep movsl\n" \ " movl %3,%0\n" \ - "1: rep; movsb\n" \ + "1: rep movsb\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \ diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index f5dd84eb55dc..262f7ca1fb95 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -35,7 +35,7 @@ # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) # - (66&F2): Both 0x66 and 0xF2 prefixes are specified. # -# REX2 Prefix +# REX2 Prefix Superscripts # - (!REX2): REX2 is not allowed # - (REX2): REX2 variant e.g. JMPABS @@ -147,7 +147,7 @@ AVXcode: # 0x60 - 0x6f 60: PUSHA/PUSHAD (i64) 61: POPA/POPAD (i64) -62: BOUND Gv,Ma (i64) | EVEX (Prefix) +62: BOUND Gv,Ma (i64) | EVEX (Prefix),(o64) 63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) 64: SEG=FS (Prefix) 65: SEG=GS (Prefix) @@ -253,8 +253,8 @@ c0: Grp2 Eb,Ib (1A) c1: Grp2 Ev,Ib (1A) c2: RETN Iw (f64) c3: RETN -c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) -c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) +c4: LES Gz,Mp (i64) | VEX+2byte (Prefix),(o64) +c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix),(o64) c6: Grp11A Eb,Ib (1A) c7: Grp11B Ev,Iz (1A) c8: ENTER Iw,Ib @@ -286,10 +286,10 @@ df: ESC # Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix # in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation # to 16 bits. 
In 32-bit mode, 0x66 is accepted by both Intel and AMD. -e0: LOOPNE/LOOPNZ Jb (f64) (!REX2) -e1: LOOPE/LOOPZ Jb (f64) (!REX2) -e2: LOOP Jb (f64) (!REX2) -e3: JrCXZ Jb (f64) (!REX2) +e0: LOOPNE/LOOPNZ Jb (f64),(!REX2) +e1: LOOPE/LOOPZ Jb (f64),(!REX2) +e2: LOOP Jb (f64),(!REX2) +e3: JrCXZ Jb (f64),(!REX2) e4: IN AL,Ib (!REX2) e5: IN eAX,Ib (!REX2) e6: OUT Ib,AL (!REX2) @@ -298,10 +298,10 @@ e7: OUT Ib,eAX (!REX2) # in "near" jumps and calls is 16-bit. For CALL, # push of return address is 16-bit wide, RSP is decremented by 2 # but is not truncated to 16 bits, unlike RIP. -e8: CALL Jz (f64) (!REX2) -e9: JMP-near Jz (f64) (!REX2) -ea: JMP-far Ap (i64) (!REX2) -eb: JMP-short Jb (f64) (!REX2) +e8: CALL Jz (f64),(!REX2) +e9: JMP-near Jz (f64),(!REX2) +ea: JMP-far Ap (i64),(!REX2) +eb: JMP-short Jb (f64),(!REX2) ec: IN AL,DX (!REX2) ed: IN eAX,DX (!REX2) ee: OUT DX,AL (!REX2) @@ -478,22 +478,22 @@ AVXcode: 1 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev) # 0x0f 0x80-0x8f # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). -80: JO Jz (f64) (!REX2) -81: JNO Jz (f64) (!REX2) -82: JB/JC/JNAE Jz (f64) (!REX2) -83: JAE/JNB/JNC Jz (f64) (!REX2) -84: JE/JZ Jz (f64) (!REX2) -85: JNE/JNZ Jz (f64) (!REX2) -86: JBE/JNA Jz (f64) (!REX2) -87: JA/JNBE Jz (f64) (!REX2) -88: JS Jz (f64) (!REX2) -89: JNS Jz (f64) (!REX2) -8a: JP/JPE Jz (f64) (!REX2) -8b: JNP/JPO Jz (f64) (!REX2) -8c: JL/JNGE Jz (f64) (!REX2) -8d: JNL/JGE Jz (f64) (!REX2) -8e: JLE/JNG Jz (f64) (!REX2) -8f: JNLE/JG Jz (f64) (!REX2) +80: JO Jz (f64),(!REX2) +81: JNO Jz (f64),(!REX2) +82: JB/JC/JNAE Jz (f64),(!REX2) +83: JAE/JNB/JNC Jz (f64),(!REX2) +84: JE/JZ Jz (f64),(!REX2) +85: JNE/JNZ Jz (f64),(!REX2) +86: JBE/JNA Jz (f64),(!REX2) +87: JA/JNBE Jz (f64),(!REX2) +88: JS Jz (f64),(!REX2) +89: JNS Jz (f64),(!REX2) +8a: JP/JPE Jz (f64),(!REX2) +8b: JNP/JPO Jz (f64),(!REX2) +8c: JL/JNGE Jz (f64),(!REX2) +8d: JNL/JGE Jz (f64),(!REX2) +8e: JLE/JNG Jz (f64),(!REX2) +8f: JNLE/JG Jz (f64),(!REX2) # 0x0f 0x90-0x9f 90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66) 91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 32035d5be5a0..3faa60f13a61 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -3,12 +3,10 @@ KCOV_INSTRUMENT_tlb.o := n KCOV_INSTRUMENT_mem_encrypt.o := n KCOV_INSTRUMENT_mem_encrypt_amd.o := n -KCOV_INSTRUMENT_mem_encrypt_identity.o := n KCOV_INSTRUMENT_pgprot.o := n KASAN_SANITIZE_mem_encrypt.o := n KASAN_SANITIZE_mem_encrypt_amd.o := n -KASAN_SANITIZE_mem_encrypt_identity.o := n KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions @@ -16,12 +14,10 @@ KASAN_SANITIZE_pgprot.o := n KCSAN_SANITIZE := n # Avoid recursion by not calling KMSAN hooks for CEA code. 
KMSAN_SANITIZE_cpu_entry_area.o := n -KMSAN_SANITIZE_mem_encrypt_identity.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg CFLAGS_REMOVE_mem_encrypt_amd.o = -pg -CFLAGS_REMOVE_mem_encrypt_identity.o = -pg CFLAGS_REMOVE_pgprot.o = -pg endif @@ -32,7 +28,6 @@ obj-y += pat/ # Make sure __phys_addr has no stackprotector CFLAGS_physaddr.o := -fno-stack-protector -CFLAGS_mem_encrypt_identity.o := -fno-stack-protector CFLAGS_fault.o := -I $(src)/../include/asm/trace @@ -63,5 +58,4 @@ obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bfa444a7dbb0..aa56d9ac0b8f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -28,6 +28,7 @@ #include <asm/text-patching.h> #include <asm/memtype.h> #include <asm/paravirt.h> +#include <asm/mmu_context.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -824,31 +825,33 @@ void __init poking_init(void) spinlock_t *ptl; pte_t *ptep; - poking_mm = mm_alloc(); - BUG_ON(!poking_mm); + text_poke_mm = mm_alloc(); + BUG_ON(!text_poke_mm); /* Xen PV guests need the PGD to be pinned. */ - paravirt_enter_mmap(poking_mm); + paravirt_enter_mmap(text_poke_mm); + + set_notrack_mm(text_poke_mm); /* * Randomize the poking address, but make sure that the following page * will be mapped at the same PMD. We need 2 pages, so find space for 3, * and adjust the address if the PMD ends after the first one. */ - poking_addr = TASK_UNMAPPED_BASE; + text_poke_mm_addr = TASK_UNMAPPED_BASE; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); - if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) - poking_addr += PAGE_SIZE; + if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + text_poke_mm_addr += PAGE_SIZE; /* * We need to trigger the allocation of the page-tables that will be * needed for poking now. Later, poking may be performed in an atomic * section, which might cause allocation to fail. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); BUG_ON(!ptep); pte_unmap_unlock(ptep, ptl); } diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 7490ff6d83b1..faf3a13fb6ba 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -40,7 +40,9 @@ * section is later cleared. 
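
The randomized text_poke_mm_addr chosen in poking_init() above must leave the page after it inside the same PMD, since poking maps two consecutive pages; the range is shrunk by three pages so the one-page bump always stays in bounds. A self-contained C check of that adjustment, with PAGE_SIZE/PMD_SIZE as the usual 4 KiB/2 MiB values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 0x1000ull
#define PMD_SIZE  0x200000ull		/* 2 MiB: one PMD maps 512 pages */
#define PMD_MASK  (~(PMD_SIZE - 1))

/*
 * Mirror the adjustment in poking_init(): if the page after 'addr' starts on a
 * PMD boundary, the two poking pages would straddle two PMDs; bumping the base
 * by one page puts both pages under the same PMD.
 */
static uint64_t fixup_poking_addr(uint64_t addr)
{
	if (((addr + PAGE_SIZE) & ~PMD_MASK) == 0)
		addr += PAGE_SIZE;
	return addr;
}

int main(void)
{
	/* Last page of a PMD: the following page would otherwise live in a different PMD. */
	uint64_t bad = 0x400000 - PAGE_SIZE;
	uint64_t fixed = fixup_poking_addr(bad);

	assert((fixed & PMD_MASK) == ((fixed + PAGE_SIZE) & PMD_MASK));
	printf("0x%llx -> 0x%llx\n",
	       (unsigned long long)bad, (unsigned long long)fixed);
	return 0;
}
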
*/ u64 sme_me_mask __section(".data") = 0; +SYM_PIC_ALIAS(sme_me_mask); u64 sev_status __section(".data") = 0; +SYM_PIC_ALIAS(sev_status); u64 sev_check_data __section(".data") = 0; EXPORT_SYMBOL(sme_me_mask); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index f7ae44d3dd9e..e62af72923e8 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -10,6 +10,7 @@ #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); +SYM_PIC_ALIAS(physical_mask); #endif pgtable_t pte_alloc_one(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index eb83348f9305..2ea8cec6cd49 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -847,7 +847,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && + if (IS_ENABLED(CONFIG_DEBUG_VM) && + WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -899,20 +900,13 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* - * Let nmi_uaccess_okay() and finish_asid_transition() - * know that CR3 is changing. + * Indicate that CR3 is about to change. nmi_uaccess_okay() + * and others are sensitive to the window where mm_cpumask(), + * CR3 and cpu_tlbstate.loaded_mm are not all in sync. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); - /* - * Leave this CPU in prev's mm_cpumask. Atomic writes to - * mm_cpumask can be expensive under contention. The CPU - * will be removed lazily at TLB flush time. - */ - VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, - mm_cpumask(prev))); - /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -972,6 +966,77 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) } /* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. Temporary mms can also be used for EFI runtime service + * calls or similar functionality. + * + * It is illegal to schedule while using a temporary mm -- the context switch + * code is unaware of the temporary mm and does not know how to context switch. + * Use a real (non-temporary) mm in a kernel thread if you need to sleep. + * + * Note: For sensitive memory writes, the temporary mm needs to be used + * exclusively by a single core, and IRQs should be disabled while the + * temporary mm is loaded, thereby preventing interrupt handler bugs from + * overriding the kernel memory protection. 
+ */ +struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *prev_mm; + + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) + leave_mm(); + + prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, temp_mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return prev_mm; +} + +void unuse_temporary_mm(struct mm_struct *prev_mm) +{ + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ + cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm))); + + switch_mm_irqs_off(NULL, prev_mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + +/* * Call this when reinitializing a CPU. It fixes the following potential * problems: * @@ -1204,8 +1269,16 @@ done: static bool should_flush_tlb(int cpu, void *data) { + struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); struct flush_tlb_info *info = data; + /* + * Order the 'loaded_mm' and 'is_lazy' against their + * write ordering in switch_mm_irqs_off(). Ensure + * 'is_lazy' is at least as new as 'loaded_mm'. + */ + smp_rmb(); + /* Lazy TLB will get flushed at the next context switch. */ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) return false; @@ -1214,8 +1287,15 @@ static bool should_flush_tlb(int cpu, void *data) if (!info->mm) return true; + /* + * While switching, the remote CPU could have state from + * either the prev or next mm. Assume the worst and flush. + */ + if (loaded_mm == LOADED_MM_SWITCHING) + return true; + /* The target mm is loaded, and the CPU is not lazy. */ - if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + if (loaded_mm == info->mm) return true; /* In cpumask, but not the loaded mm? Periodically remove by flushing. 
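
should_flush_tlb() above decides, per remote CPU, whether the flush IPI is needed: lazy CPUs are skipped, kernel flushes (no target mm) always flush, a CPU caught mid-switch (LOADED_MM_SWITCHING) is assumed to need it, and so is a CPU with the target mm loaded. A userspace C sketch of just that decision table; it deliberately drops the smp_rmb() pairing with switch_mm_irqs_off() and the cpumask-trimming tail, which need the real kernel context:

#include <stdbool.h>
#include <stdio.h>

struct mm { int id; };

#define LOADED_MM_SWITCHING ((struct mm *)1)

/* Userspace model of the per-CPU state the real function consults. */
struct cpu_state {
	bool is_lazy;		/* cpu_tlbstate_shared.is_lazy */
	struct mm *loaded_mm;	/* cpu_tlbstate.loaded_mm */
};

static bool should_flush(const struct cpu_state *cpu, const struct mm *target)
{
	if (cpu->is_lazy)
		return false;		/* will flush at the next context switch */
	if (!target)
		return true;		/* kernel flush: no specific mm */
	if (cpu->loaded_mm == LOADED_MM_SWITCHING)
		return true;		/* could hold prev or next: assume the worst */
	if (cpu->loaded_mm == target)
		return true;		/* target mm is loaded and the CPU is not lazy */
	return false;			/* stale cpumask entry (the kernel may still trim it) */
}

int main(void)
{
	struct mm a = { 1 }, b = { 2 };
	struct cpu_state switching = { .is_lazy = false, .loaded_mm = LOADED_MM_SWITCHING };
	struct cpu_state running_b = { .is_lazy = false, .loaded_mm = &b };

	printf("%d %d\n", should_flush(&switching, &a), should_flush(&running_b, &a));
	return 0;
}
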
*/ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9e5fe2ba858f..e2b9991c3326 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -629,7 +629,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, goto out; ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + smp_text_poke_single(ip, new_insn, X86_PATCH_SIZE, NULL); ret = 0; } out: diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a4b4ebd41b8f..e7e8f77f77f8 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -89,6 +89,7 @@ int __init efi_alloc_page_tables(void) efi_mm.pgd = efi_pgd; mm_init_cpumask(&efi_mm); init_new_context(NULL, &efi_mm); + set_notrack_mm(&efi_mm); return 0; @@ -434,15 +435,12 @@ void __init efi_dump_pagetable(void) */ static void efi_enter_mm(void) { - efi_prev_mm = current->active_mm; - current->active_mm = &efi_mm; - switch_mm(efi_prev_mm, &efi_mm, NULL); + efi_prev_mm = use_temporary_mm(&efi_mm); } static void efi_leave_mm(void) { - current->active_mm = efi_prev_mm; - switch_mm(&efi_mm, efi_prev_mm, NULL); + unuse_temporary_mm(efi_prev_mm); } void arch_efi_call_virt_setup(void) diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index cfa18ec7d55f..1d78e5631bb8 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -87,8 +87,7 @@ SYM_CODE_START(pvh_start_xen) mov %ebx, %esi movl rva(pvh_start_info_sz)(%ebp), %ecx shr $2,%ecx - rep - movsl + rep movsl leal rva(early_stack_end)(%ebp), %esp diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 5606a15cf9a1..fb910d9f8471 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -69,8 +69,7 @@ copy_loop: movl pbe_orig_address(%edx), %edi movl $(PAGE_SIZE >> 2), %ecx - rep - movsl + rep movsl movl pbe_next(%edx), %edx jmp copy_loop diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 66f066b8feda..c73be0a02a6c 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -138,8 +138,7 @@ SYM_FUNC_START(core_restore_code) movq pbe_address(%rdx), %rsi movq pbe_orig_address(%rdx), %rdi movq $(PAGE_SIZE >> 3), %rcx - rep - movsq + rep movsq /* progress to the next pbe */ movq pbe_next(%rdx), %rdx diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 5770c8097f32..2c19d7fc8a85 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -64,6 +64,8 @@ BEGIN { modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" + invalid64_expr = "\\(i64\\)" + only64_expr = "\\(o64\\)" rex_expr = "^((REX(\\.[XRWB]+)+)|(REX$))" rex2_expr = "\\(REX2\\)" no_rex2_expr = "\\(!REX2\\)" @@ -319,6 +321,11 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, force64_expr)) flags = add_flags(flags, "INAT_FORCE64") + # check invalid in 64-bit (and no only64) + if (match(ext, invalid64_expr) && + !match($0, only64_expr)) + flags = add_flags(flags, "INAT_INV64") + # check REX2 not allowed if (match(ext, no_rex2_expr)) flags = add_flags(flags, "INAT_NO_REX2") diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index ab5c8e47049c..9193a7790a71 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -31,8 +31,8 @@ struct faultinfo { 
#define ___backtrack_faulted(_faulted) \ asm volatile ( \ - "mov $0, %0\n" \ "movl $__get_kernel_nofault_faulted_%=,%1\n" \ + "mov $0, %0\n" \ "jmp _end_%=\n" \ "__get_kernel_nofault_faulted_%=:\n" \ "mov $1, %0;" \ diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index 26fb4835d3e9..61e4ca1e0ab5 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -31,8 +31,8 @@ struct faultinfo { #define ___backtrack_faulted(_faulted) \ asm volatile ( \ - "mov $0, %0\n" \ "movq $__get_kernel_nofault_faulted_%=,%1\n" \ + "mov $0, %0\n" \ "jmp _end_%=\n" \ "__get_kernel_nofault_faulted_%=:\n" \ "mov $1, %0;" \ |