diff options
Diffstat (limited to 'arch')
218 files changed, 1939 insertions, 975 deletions
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 483965c5a4de..b154b4e3dfa8 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -5,4 +5,5 @@ generic-y += agp.h generic-y += asm-offsets.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 4c69522e0328..483caacc6988 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -5,5 +5,6 @@ generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 71fc5dd4123f..73e6647bea46 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -136,7 +136,7 @@ config ARM select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ - select HAVE_RUST if CPU_LITTLE_ENDIAN && CPU_32v7 + select HAVE_RUST if CPU_LITTLE_ENDIAN && CPU_32v7 && !KASAN select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 diff --git a/arch/arm/boot/dts/gemini/gemini-sl93512r.dts b/arch/arm/boot/dts/gemini/gemini-sl93512r.dts index 4992ec276de9..341dec9b636a 100644 --- a/arch/arm/boot/dts/gemini/gemini-sl93512r.dts +++ b/arch/arm/boot/dts/gemini/gemini-sl93512r.dts @@ -146,7 +146,7 @@ partitions { compatible = "redboot-fis"; /* Eraseblock at 0xfe0000 */ - fis-index-block = <0x1fc>; + fis-index-block = <0x7f>; }; }; diff --git a/arch/arm/boot/dts/gemini/gemini-sq201.dts b/arch/arm/boot/dts/gemini/gemini-sq201.dts index f8c6f6e5cdea..bfd1e8581ad6 100644 --- a/arch/arm/boot/dts/gemini/gemini-sq201.dts +++ b/arch/arm/boot/dts/gemini/gemini-sq201.dts @@ -134,7 +134,7 @@ partitions { compatible = "redboot-fis"; /* Eraseblock at 0xfe0000 */ - fis-index-block = <0x1fc>; + fis-index-block = <0x7f>; }; }; diff --git 
a/arch/arm/boot/dts/microchip/sam9x7.dtsi b/arch/arm/boot/dts/microchip/sam9x7.dtsi index d242d7a934d0..c680a5033b6b 100644 --- a/arch/arm/boot/dts/microchip/sam9x7.dtsi +++ b/arch/arm/boot/dts/microchip/sam9x7.dtsi @@ -990,9 +990,9 @@ <62 IRQ_TYPE_LEVEL_HIGH 3>, /* Queue 3 */ <63 IRQ_TYPE_LEVEL_HIGH 3>, /* Queue 4 */ <64 IRQ_TYPE_LEVEL_HIGH 3>; /* Queue 5 */ - clocks = <&pmc PMC_TYPE_PERIPHERAL 24>, <&pmc PMC_TYPE_PERIPHERAL 24>, <&pmc PMC_TYPE_GCK 24>, <&pmc PMC_TYPE_GCK 67>; - clock-names = "hclk", "pclk", "tx_clk", "tsu_clk"; - assigned-clocks = <&pmc PMC_TYPE_GCK 67>; + clocks = <&pmc PMC_TYPE_PERIPHERAL 24>, <&pmc PMC_TYPE_PERIPHERAL 24>, <&pmc PMC_TYPE_GCK 24>; + clock-names = "hclk", "pclk", "tsu_clk"; + assigned-clocks = <&pmc PMC_TYPE_GCK 24>; assigned-clock-rates = <266666666>; status = "disabled"; }; diff --git a/arch/arm/boot/dts/renesas/r7s72100-genmai.dts b/arch/arm/boot/dts/renesas/r7s72100-genmai.dts index 3c3756509714..da552a66615e 100644 --- a/arch/arm/boot/dts/renesas/r7s72100-genmai.dts +++ b/arch/arm/boot/dts/renesas/r7s72100-genmai.dts @@ -34,9 +34,6 @@ clocks = <&mstp9_clks R7S72100_CLK_SPIBSC0>; power-domains = <&cpg_clocks>; - #address-cells = <1>; - #size-cells = <1>; - partitions { compatible = "fixed-partitions"; #address-cells = <1>; diff --git a/arch/arm/boot/dts/renesas/r7s72100-rskrza1.dts b/arch/arm/boot/dts/renesas/r7s72100-rskrza1.dts index 91178fb9e721..3306bc9b7bc3 100644 --- a/arch/arm/boot/dts/renesas/r7s72100-rskrza1.dts +++ b/arch/arm/boot/dts/renesas/r7s72100-rskrza1.dts @@ -36,8 +36,6 @@ power-domains = <&cpg_clocks>; bank-width = <4>; device-width = <1>; - #address-cells = <1>; - #size-cells = <1>; partitions { compatible = "fixed-partitions"; diff --git a/arch/arm/boot/dts/renesas/r7s72100.dtsi b/arch/arm/boot/dts/renesas/r7s72100.dtsi index 245c26bb8e03..6ec57ffa72e8 100644 --- a/arch/arm/boot/dts/renesas/r7s72100.dtsi +++ b/arch/arm/boot/dts/renesas/r7s72100.dtsi @@ -37,7 +37,7 @@ clock-div = <3>; }; - bsc: bus { + 
bsc: bus@0 { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/boot/dts/renesas/r8a7778.dtsi b/arch/arm/boot/dts/renesas/r8a7778.dtsi index 859dd29dfce3..7db456b19795 100644 --- a/arch/arm/boot/dts/renesas/r8a7778.dtsi +++ b/arch/arm/boot/dts/renesas/r8a7778.dtsi @@ -40,7 +40,7 @@ spi2 = &hspi2; }; - lbsc: bus { + lbsc: bus@0 { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/boot/dts/renesas/r8a7779.dtsi b/arch/arm/boot/dts/renesas/r8a7779.dtsi index e437c22f452d..9e8a7e190c89 100644 --- a/arch/arm/boot/dts/renesas/r8a7779.dtsi +++ b/arch/arm/boot/dts/renesas/r8a7779.dtsi @@ -704,7 +704,7 @@ }; }; - lbsc: bus { + lbsc: bus@0 { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/boot/dts/renesas/r8a7792.dtsi b/arch/arm/boot/dts/renesas/r8a7792.dtsi index 9e0de69ac3a3..fbdbcff1cbed 100644 --- a/arch/arm/boot/dts/renesas/r8a7792.dtsi +++ b/arch/arm/boot/dts/renesas/r8a7792.dtsi @@ -86,7 +86,7 @@ bootph-all; }; - lbsc: bus { + lbsc: bus@0 { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 03657ff8fbe3..decad5f2c826 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += early_ioremap.h generic-y += extable.h generic-y += flat.h generic-y += parport.h +generic-y += ring_buffer.h generated-y += mach-types.h generated-y += unistd-nr.h diff --git a/arch/arm/mach-socfpga/platsmp.c b/arch/arm/mach-socfpga/platsmp.c index 201191cf68f3..349e6c54518e 100644 --- a/arch/arm/mach-socfpga/platsmp.c +++ b/arch/arm/mach-socfpga/platsmp.c @@ -78,6 +78,7 @@ static void __init socfpga_smp_prepare_cpus(unsigned int max_cpus) } socfpga_scu_base_addr = of_iomap(np, 0); + of_node_put(np); if (!socfpga_scu_base_addr) return; scu_enable(socfpga_scu_base_addr); diff --git a/arch/arm/mach-versatile/integrator_cp.c 
b/arch/arm/mach-versatile/integrator_cp.c index 2ed4ded56b3f..03dfb5f720b7 100644 --- a/arch/arm/mach-versatile/integrator_cp.c +++ b/arch/arm/mach-versatile/integrator_cp.c @@ -86,14 +86,6 @@ static u64 notrace intcp_read_sched_clock(void) return val; } -static void __init intcp_init_early(void) -{ - cm_map = syscon_regmap_lookup_by_compatible("arm,core-module-integrator"); - if (IS_ERR(cm_map)) - return; - sched_clock_register(intcp_read_sched_clock, 32, 24000000); -} - static void __init intcp_init_irq_of(void) { cm_init(); @@ -119,6 +111,10 @@ static void __init intcp_init_of(void) { struct device_node *cpcon; + cm_map = syscon_regmap_lookup_by_compatible("arm,core-module-integrator"); + if (!IS_ERR(cm_map)) + sched_clock_register(intcp_read_sched_clock, 32, 24000000); + cpcon = of_find_matching_node(NULL, intcp_syscon_match); if (!cpcon) return; @@ -138,7 +134,6 @@ static const char * intcp_dt_board_compat[] = { DT_MACHINE_START(INTEGRATOR_CP_DT, "ARM Integrator/CP (Device Tree)") .reserve = integrator_reserve, .map_io = intcp_map_io, - .init_early = intcp_init_early, .init_irq = intcp_init_irq_of, .init_machine = intcp_init_of, .dt_compat = intcp_dt_board_compat, diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 73a10f65ce8b..6b005c8fef70 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -63,6 +63,9 @@ else KBUILD_CFLAGS += -fasynchronous-unwind-tables KBUILD_AFLAGS += -fasynchronous-unwind-tables KBUILD_RUSTFLAGS += -Cforce-unwind-tables=y -Zuse-sync-unwind=n +# Work around rustc bug on compilers without +# https://github.com/rust-lang/rust/pull/156973. 
+KBUILD_RUSTFLAGS += $(if $(call rustc-min-version,109800),,-Zllvm_module_flag=uwtable:u32:2:max) endif ifeq ($(CONFIG_STACKPROTECTOR_PER_TASK),y) diff --git a/arch/arm64/boot/dts/qcom/eliza.dtsi b/arch/arm64/boot/dts/qcom/eliza.dtsi index 4a7a0ac40ce6..7e97361a5dc5 100644 --- a/arch/arm64/boot/dts/qcom/eliza.dtsi +++ b/arch/arm64/boot/dts/qcom/eliza.dtsi @@ -843,7 +843,11 @@ "qcom,inline-crypto-engine"; reg = <0x0 0x01d88000 0x0 0x18000>; - clocks = <&gcc GCC_UFS_PHY_ICE_CORE_CLK>; + clocks = <&gcc GCC_UFS_PHY_ICE_CORE_CLK>, + <&gcc GCC_UFS_PHY_AHB_CLK>; + clock-names = "core", + "iface"; + power-domains = <&gcc GCC_UFS_PHY_GDSC>; }; tcsr_mutex: hwlock@1f40000 { diff --git a/arch/arm64/boot/dts/qcom/glymur.dtsi b/arch/arm64/boot/dts/qcom/glymur.dtsi index f23cf81ddb77..82436984485d 100644 --- a/arch/arm64/boot/dts/qcom/glymur.dtsi +++ b/arch/arm64/boot/dts/qcom/glymur.dtsi @@ -2314,11 +2314,9 @@ clocks = <&gcc GCC_USB3_MP_PHY_AUX_CLK>, <&tcsr TCSR_USB3_0_CLKREF_EN>, - <&rpmhcc RPMH_CXO_CLK>, <&gcc GCC_USB3_MP_PHY_COM_AUX_CLK>, <&gcc GCC_USB3_MP_PHY_PIPE_0_CLK>; clock-names = "aux", - "clkref", "ref", "com_aux", "pipe"; @@ -2343,11 +2341,9 @@ clocks = <&gcc GCC_USB3_MP_PHY_AUX_CLK>, <&tcsr TCSR_USB3_1_CLKREF_EN>, - <&rpmhcc RPMH_CXO_CLK>, <&gcc GCC_USB3_MP_PHY_COM_AUX_CLK>, <&gcc GCC_USB3_MP_PHY_PIPE_1_CLK>; clock-names = "aux", - "clkref", "ref", "com_aux", "pipe"; @@ -2482,15 +2478,13 @@ reg = <0x0 0x00fde000 0x0 0x8000>; clocks = <&gcc GCC_USB3_SEC_PHY_AUX_CLK>, - <&rpmhcc RPMH_CXO_CLK>, + <&tcsr TCSR_USB4_1_CLKREF_EN>, <&gcc GCC_USB3_SEC_PHY_COM_AUX_CLK>, - <&gcc GCC_USB3_SEC_PHY_PIPE_CLK>, - <&tcsr TCSR_USB4_1_CLKREF_EN>; + <&gcc GCC_USB3_SEC_PHY_PIPE_CLK>; clock-names = "aux", "ref", "com_aux", - "usb3_pipe", - "clkref"; + "usb3_pipe"; power-domains = <&gcc GCC_USB_1_PHY_GDSC>; @@ -3750,15 +3744,13 @@ reg = <0x0 0x088e1000 0x0 0x8000>; clocks = <&gcc GCC_USB3_TERT_PHY_AUX_CLK>, - <&rpmhcc RPMH_CXO_CLK>, + <&tcsr TCSR_USB4_2_CLKREF_EN>, <&gcc 
GCC_USB3_TERT_PHY_COM_AUX_CLK>, - <&gcc GCC_USB3_TERT_PHY_PIPE_CLK>, - <&tcsr TCSR_USB4_2_CLKREF_EN>; + <&gcc GCC_USB3_TERT_PHY_PIPE_CLK>; clock-names = "aux", "ref", "com_aux", - "usb3_pipe", - "clkref"; + "usb3_pipe"; power-domains = <&gcc GCC_USB_2_PHY_GDSC>; diff --git a/arch/arm64/boot/dts/qcom/milos.dtsi b/arch/arm64/boot/dts/qcom/milos.dtsi index 4a64a98a434b..a6e463f3885d 100644 --- a/arch/arm64/boot/dts/qcom/milos.dtsi +++ b/arch/arm64/boot/dts/qcom/milos.dtsi @@ -1275,7 +1275,11 @@ "qcom,inline-crypto-engine"; reg = <0x0 0x01d88000 0x0 0x18000>; - clocks = <&gcc GCC_UFS_PHY_ICE_CORE_CLK>; + clocks = <&gcc GCC_UFS_PHY_ICE_CORE_CLK>, + <&gcc GCC_UFS_PHY_AHB_CLK>; + clock-names = "core", + "iface"; + power-domains = <&gcc UFS_PHY_GDSC>; }; tcsr_mutex: hwlock@1f40000 { diff --git a/arch/arm64/boot/dts/qcom/x1-dell-thena.dtsi b/arch/arm64/boot/dts/qcom/x1-dell-thena.dtsi index 0d9a324cc6cc..db291730130c 100644 --- a/arch/arm64/boot/dts/qcom/x1-dell-thena.dtsi +++ b/arch/arm64/boot/dts/qcom/x1-dell-thena.dtsi @@ -982,12 +982,6 @@ status = "okay"; }; -&i2c20 { - clock-frequency = <400000>; - - status = "okay"; -}; - &lpass_tlmm { spkr_01_sd_n_active: spkr-01-sd-n-active-state { pins = "gpio12"; @@ -1308,6 +1302,7 @@ &tlmm { gpio-reserved-ranges = <44 4>, /* SPI11 (TPM) */ <76 4>, /* SPI19 (TZ Protected) */ + <80 2>, /* I2C20 (Battery SMBus) */ <238 1>; /* UFS Reset */ cam_rgb_default: cam-rgb-default-state { diff --git a/arch/arm64/boot/dts/renesas/draak-ebisu-panel-aa104xd12.dtso b/arch/arm64/boot/dts/renesas/draak-ebisu-panel-aa104xd12.dtso index 258f8668ca36..90767d74e21b 100644 --- a/arch/arm64/boot/dts/renesas/draak-ebisu-panel-aa104xd12.dtso +++ b/arch/arm64/boot/dts/renesas/draak-ebisu-panel-aa104xd12.dtso @@ -27,7 +27,12 @@ status = "okay"; ports { + #address-cells = <1>; + #size-cells = <0>; + port@1 { + reg = <1>; + lvds1_out: endpoint { remote-endpoint = <&panel_in>; }; diff --git a/arch/arm64/boot/dts/renesas/r8a78000.dtsi 
b/arch/arm64/boot/dts/renesas/r8a78000.dtsi index 3e1c98903cea..3ec1b53d2782 100644 --- a/arch/arm64/boot/dts/renesas/r8a78000.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a78000.dtsi @@ -699,7 +699,7 @@ "renesas,rcar-gen5-scif", "renesas,scif"; reg = <0 0xc0700000 0 0x40>; interrupts = <GIC_ESPI 10 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&dummy_clk_sgasyncd16>, <&dummy_clk_sgasyncd16>, <&scif_clk>; + clocks = <&dummy_clk_sgasyncd16>, <&dummy_clk_sgasyncd4>, <&scif_clk>; clock-names = "fck", "brg_int", "scif_clk"; status = "disabled"; }; @@ -709,7 +709,7 @@ "renesas,rcar-gen5-scif", "renesas,scif"; reg = <0 0xc0704000 0 0x40>; interrupts = <GIC_ESPI 11 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&dummy_clk_sgasyncd16>, <&dummy_clk_sgasyncd16>, <&scif_clk>; + clocks = <&dummy_clk_sgasyncd16>, <&dummy_clk_sgasyncd4>, <&scif_clk>; clock-names = "fck", "brg_int", "scif_clk"; status = "disabled"; }; @@ -719,7 +719,7 @@ "renesas,rcar-gen5-scif", "renesas,scif"; reg = <0 0xc0708000 0 0x40>; interrupts = <GIC_ESPI 12 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&dummy_clk_sgasyncd16>, <&dummy_clk_sgasyncd16>, <&scif_clk>; + clocks = <&dummy_clk_sgasyncd16>, <&dummy_clk_sgasyncd4>, <&scif_clk>; clock-names = "fck", "brg_int", "scif_clk"; status = "disabled"; }; @@ -729,7 +729,7 @@ "renesas,rcar-gen5-scif", "renesas,scif"; reg = <0 0xc070c000 0 0x40>; interrupts = <GIC_ESPI 13 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&dummy_clk_sgasyncd16>, <&dummy_clk_sgasyncd16>, <&scif_clk>; + clocks = <&dummy_clk_sgasyncd16>, <&dummy_clk_sgasyncd4>, <&scif_clk>; clock-names = "fck", "brg_int", "scif_clk"; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/renesas/r9a09g056.dtsi b/arch/arm64/boot/dts/renesas/r9a09g056.dtsi index 40525470194e..7ccddd6a4a9a 100644 --- a/arch/arm64/boot/dts/renesas/r9a09g056.dtsi +++ b/arch/arm64/boot/dts/renesas/r9a09g056.dtsi @@ -1327,6 +1327,7 @@ resets = <&cpg 0xaf>; power-domains = <&cpg>; #reset-cells = <0>; + #mux-state-cells = <1>; status = "disabled"; }; diff --git 
a/arch/arm64/boot/dts/renesas/r9a09g057.dtsi b/arch/arm64/boot/dts/renesas/r9a09g057.dtsi index 9581af58024e..6f6fe5f36bef 100644 --- a/arch/arm64/boot/dts/renesas/r9a09g057.dtsi +++ b/arch/arm64/boot/dts/renesas/r9a09g057.dtsi @@ -1345,6 +1345,7 @@ resets = <&cpg 0xaf>; power-domains = <&cpg>; #reset-cells = <0>; + #mux-state-cells = <1>; status = "disabled"; }; @@ -1355,6 +1356,7 @@ resets = <&cpg 0xaf>; power-domains = <&cpg>; #reset-cells = <0>; + #mux-state-cells = <1>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/renesas/rz-smarc-cru-csi-ov5645.dtsi b/arch/arm64/boot/dts/renesas/rz-smarc-cru-csi-ov5645.dtsi index 4d2b0655859a..3feffa4f16a9 100644 --- a/arch/arm64/boot/dts/renesas/rz-smarc-cru-csi-ov5645.dtsi +++ b/arch/arm64/boot/dts/renesas/rz-smarc-cru-csi-ov5645.dtsi @@ -46,7 +46,12 @@ status = "okay"; ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + csi2_in: endpoint { clock-lanes = <0>; data-lanes = <1 2>; diff --git a/arch/arm64/boot/dts/renesas/rz-smarc-du-adv7513.dtsi b/arch/arm64/boot/dts/renesas/rz-smarc-du-adv7513.dtsi index 36707576030d..f5412578ee65 100644 --- a/arch/arm64/boot/dts/renesas/rz-smarc-du-adv7513.dtsi +++ b/arch/arm64/boot/dts/renesas/rz-smarc-du-adv7513.dtsi @@ -26,7 +26,12 @@ status = "okay"; ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + du_out_rgb: endpoint { remote-endpoint = <&adv7513_in>; }; diff --git a/arch/arm64/boot/dts/renesas/salvator-panel-aa104xd12.dtso b/arch/arm64/boot/dts/renesas/salvator-panel-aa104xd12.dtso index c83a30adc6ad..7807c3f80409 100644 --- a/arch/arm64/boot/dts/renesas/salvator-panel-aa104xd12.dtso +++ b/arch/arm64/boot/dts/renesas/salvator-panel-aa104xd12.dtso @@ -27,7 +27,12 @@ status = "okay"; ports { + #address-cells = <1>; + #size-cells = <0>; + port@1 { + reg = <1>; + lvds0_out: endpoint { remote-endpoint = <&panel_in>; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 
d905a0777f93..96ce783f24e7 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -260,6 +260,7 @@ CONFIG_PCI_ENDPOINT=y CONFIG_PCI_ENDPOINT_CONFIGFS=y CONFIG_PCI_EPF_TEST=m CONFIG_PCI_PWRCTRL_GENERIC=m +CONFIG_POWER_SEQUENCING_PCIE_M2=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_FW_LOADER_USER_HELPER=y diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index f463a654a2bb..cc0702fa64a7 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -409,7 +409,7 @@ __AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000) __AARCH64_INSN_FUNCS(cbnz, 0x7F000000, 0x35000000) __AARCH64_INSN_FUNCS(tbz, 0x7F000000, 0x36000000) __AARCH64_INSN_FUNCS(tbnz, 0x7F000000, 0x37000000) -__AARCH64_INSN_FUNCS(bcond, 0xFF000010, 0x54000000) +__AARCH64_INSN_FUNCS(bcond, 0xFF000000, 0x54000000) __AARCH64_INSN_FUNCS(svc, 0xFFE0001F, 0xD4000001) __AARCH64_INSN_FUNCS(hvc, 0xFFE0001F, 0xD4000002) __AARCH64_INSN_FUNCS(smc, 0xFFE0001F, 0xD4000003) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 65eead8362e0..a49042bfa801 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -511,7 +511,6 @@ enum vcpu_sysreg { ACTLR_EL2, /* Auxiliary Control Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ - ZCR_EL2, /* SVE Control Register (EL2) */ TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ TCR_EL2, /* Translation Control Register (EL2) */ @@ -543,6 +542,7 @@ enum vcpu_sysreg { SCTLR2_EL2, /* System Control Register 2 (EL2) */ MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ + ZCR_EL2, /* SVE Control Register (EL2) */ /* Any VNCR-capable reg goes after this point */ MARKER(__VNCR_START__), diff --git a/arch/arm64/include/asm/kvm_nested.h 
b/arch/arm64/include/asm/kvm_nested.h index 091544e6af44..cdf3e8422ea1 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -23,6 +23,7 @@ static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2) static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr) { return TCR_EPD1_MASK | /* disable TTBR1_EL1 */ + ((tcr & TCR_EL2_DS) ? TCR_DS : 0) | ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) | tcr_el2_ps_to_tcr_el1_ips(tcr) | (tcr & TCR_EL2_TG0_MASK) | @@ -131,7 +132,7 @@ static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *t u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); if (!kvm_has_xnx(kvm)) - xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10); + xn &= 0b10; switch (xn) { case 0b00: @@ -147,7 +148,7 @@ static inline bool kvm_s2_trans_exec_el1(struct kvm *kvm, struct kvm_s2_trans *t u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); if (!kvm_has_xnx(kvm)) - xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10); + xn &= 0b10; switch (xn) { case 0b00: diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index e25d0d18f6d7..58200de8a221 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -33,7 +33,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); #define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio -bool tag_clear_highpages(struct page *to, int numpages); +bool tag_clear_highpages(struct page *to, int numpages, bool clear_pages); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/include/asm/ring_buffer.h b/arch/arm64/include/asm/ring_buffer.h new file mode 100644 index 000000000000..62316c406888 --- /dev/null +++ b/arch/arm64/include/asm/ring_buffer.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_ARM64_RING_BUFFER_H +#define _ASM_ARM64_RING_BUFFER_H + +#include 
<asm/cacheflush.h> + +/* Flush D-cache on persistent ring buffer */ +#define arch_ring_buffer_flush_range(start, end) dcache_clean_pop(start, end) + +#endif /* _ASM_ARM64_RING_BUFFER_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 736561480f36..7aa08d59d494 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -844,7 +844,7 @@ #define INIT_SCTLR_EL2_MMU_ON \ (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | \ SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | \ - SCTLR_ELx_ITFSB | SCTLR_EL2_RES1) + SCTLR_ELx_ITFSB | SCTLR_ELx_EIS | SCTLR_ELx_EOS | SCTLR_EL2_RES1) #define INIT_SCTLR_EL2_MMU_OFF \ (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 10869d7731b8..751bd57bc3ba 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -53,7 +53,8 @@ static inline int tlb_get_level(struct mmu_gather *tlb) static inline void tlb_flush(struct mmu_gather *tlb) { struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0); - tlbf_t flags = tlb->freed_tables ? TLBF_NONE : TLBF_NOWALKCACHE; + tlbf_t flags = (tlb->freed_tables || tlb->unshared_tables) ? + TLBF_NONE : TLBF_NOWALKCACHE; unsigned long stride = tlb_get_unmap_size(tlb); int tlb_level = tlb_get_level(tlb); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index cb54335465f6..c7a23f7c2212 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -62,6 +62,13 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, irqentry_exit_to_kernel_mode_after_preempt(regs, state); } +static __always_inline void arm64_syscall_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + mte_disable_tco_entry(current); + sme_enter_from_user_mode(); +} + /* * Handle IRQ/context state management when entering from user mode. 
* Before this function is called it is not safe to call regular kernel code, @@ -70,20 +77,30 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); + rseq_note_user_irq_entry(); mte_disable_tco_entry(current); sme_enter_from_user_mode(); } +static __always_inline void arm64_syscall_exit_to_user_mode(struct pt_regs *regs) +{ + local_irq_disable(); + syscall_exit_to_user_mode_prepare(regs); + local_daif_mask(); + sme_exit_to_user_mode(); + mte_check_tfsr_exit(); + exit_to_user_mode(); +} + /* * Handle IRQ/context state management when exiting to user mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ - static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { local_irq_disable(); - exit_to_user_mode_prepare_legacy(regs); + irqentry_exit_to_user_mode_prepare(regs); local_daif_mask(); sme_exit_to_user_mode(); mte_check_tfsr_exit(); @@ -92,7 +109,7 @@ static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); } /* @@ -716,12 +733,12 @@ static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc(struct pt_regs *regs) { - arm64_enter_from_user_mode(regs); + arm64_syscall_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); fpsimd_syscall_exit(); } @@ -868,11 +885,11 @@ static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc_compat(struct pt_regs *regs) { - arm64_enter_from_user_mode(regs); + arm64_syscall_enter_from_user_mode(regs); 
cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); } static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index ba5eab23fd90..4d08598e2891 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -983,8 +983,8 @@ static int sve_set_common(struct task_struct *target, } /* Always zero V regs, FPSR, and FPCR */ - memset(&current->thread.uw.fpsimd_state, 0, - sizeof(current->thread.uw.fpsimd_state)); + memset(&target->thread.uw.fpsimd_state, 0, + sizeof(target->thread.uw.fpsimd_state)); /* Registers: FPSIMD-only case */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8bb2c7422cc8..9453321ef8c6 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -4,6 +4,7 @@ * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ +#include <linux/arm-smccc.h> #include <linux/bug.h> #include <linux/cpu_pm.h> #include <linux/errno.h> @@ -554,8 +555,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_destroy_mpidr_data(vcpu->kvm); err = kvm_vgic_vcpu_init(vcpu); - if (err) + if (err) { + kvm_vgic_vcpu_destroy(vcpu); return err; + } err = kvm_share_hyp(vcpu, vcpu + 1); if (err) @@ -2638,6 +2641,22 @@ static int init_pkvm_host_sve_state(void) return 0; } +static int pkvm_check_sme_dvmsync_fw_call(void) +{ + struct arm_smccc_res res; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_4193714)) + return 0; + + arm_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + if (res.a0) { + kvm_err("pKVM requires firmware support for C1-Pro erratum 4193714\n"); + return -ENODEV; + } + + return 0; +} + /* * Finalizes the initialization of hyp mode, once everything else is initialized * and the initialziation process cannot fail. 
@@ -2838,6 +2857,10 @@ static int __init init_hyp_mode(void) if (err) goto out_err; + err = pkvm_check_sme_dvmsync_fw_call(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 9f8f0ae8e86e..889c2c15d7bd 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1569,7 +1569,8 @@ int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) /* Do the stage-2 translation */ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); out.esr = 0; - ret = kvm_walk_nested_s2(vcpu, ipa, &out); + scoped_guard(srcu, &vcpu->kvm->srcu) + ret = kvm_walk_nested_s2(vcpu, ipa, &out); if (ret < 0) return ret; @@ -1665,7 +1666,8 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) } /* Walk the guest's PT, looking for a match along the way */ - ret = walk_s1(vcpu, &wi, &wr, va); + scoped_guard(srcu, &vcpu->kvm->srcu) + ret = walk_s1(vcpu, &wi, &wr, va); switch (ret) { case -EINTR: /* We interrupted the walk on a match, return the level */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 98b2976837b1..e9b36a3b27bb 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -181,6 +181,8 @@ static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) val |= CPACR_EL1_ZEN; if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_EL1_SMEN; + if (cpus_have_final_cap(ARM64_HAS_S1POE)) + val |= CPACR_EL1_E0POE; write_sysreg(val, cpacr_el1); } @@ -245,7 +247,7 @@ static inline void __activate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu) __activate_fgt(hctxt, vcpu, ICH_HFGITR_EL2); } -#define __deactivate_fgt(htcxt, vcpu, reg) \ +#define __deactivate_fgt(hctxt, vcpu, reg) \ do { \ write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ SYS_ ## reg); \ @@ -462,11 +464,13 @@ static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, 
u64 *exit_code) static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) { + u64 zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + /* * The vCPU's saved SVE state layout always matches the max VL of the * vCPU. Start off with the max VL so we can load the SVE state. */ - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + sve_cond_update_zcr_vq(zcr_el2, SYS_ZCR_EL2); __sve_restore_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); @@ -476,8 +480,10 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) * nested guest, as the guest hypervisor could select a smaller VL. Slap * that into hardware before wrapping up. */ - if (is_nested_ctxt(vcpu)) - sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2); + if (is_nested_ctxt(vcpu)) { + zcr_el2 = min(zcr_el2, __vcpu_sys_reg(vcpu, ZCR_EL2)); + sve_cond_update_zcr_vq(zcr_el2, SYS_ZCR_EL2); + } write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR); } @@ -501,11 +507,11 @@ static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) return; if (vcpu_has_sve(vcpu)) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + /* A guest hypervisor may restrict the effective max VL. 
*/ if (is_nested_ctxt(vcpu)) - zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); - else - zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + zcr_el2 = min(zcr_el2, __vcpu_sys_reg(vcpu, ZCR_EL2)); write_sysreg_el2(zcr_el2, SYS_ZCR); diff --git a/arch/arm64/kvm/hyp/nvhe/clock.c b/arch/arm64/kvm/hyp/nvhe/clock.c index 32fc4313fe43..a7fc61976fd0 100644 --- a/arch/arm64/kvm/hyp/nvhe/clock.c +++ b/arch/arm64/kvm/hyp/nvhe/clock.c @@ -35,6 +35,9 @@ void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) struct clock_data *clock = &trace_clock_data; u64 bank = clock->cur ^ 1; + if (!mult || shift >= 64) + return; + clock->data[bank].mult = mult; clock->data[bank].shift = shift; clock->data[bank].epoch_ns = epoch_ns; diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index f337770ec459..9393fe3ea6a1 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -120,7 +120,7 @@ SYM_FUNC_START(__hyp_do_panic) mov x29, x0 -#ifdef PKVM_DISABLE_STAGE2_ON_PANIC +#ifdef CONFIG_PKVM_DISABLE_STAGE2_ON_PANIC /* Ensure host stage-2 is disabled */ mrs x0, hcr_el2 bic x0, x0, #HCR_VM diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 28a471d1927c..25f04629014e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -5,6 +5,7 @@ */ #include <linux/kvm_host.h> + #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> @@ -14,6 +15,7 @@ #include <hyp/fault.h> +#include <nvhe/arm-smccc.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> @@ -29,6 +31,19 @@ static struct hyp_pool host_s2_pool; static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); #define current_vm (*this_cpu_ptr(&__current_vm)) +static void pkvm_sme_dvmsync_fw_call(void) +{ + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_4193714)) { + struct arm_smccc_res res; + + /* + * Ignore the return value. 
Probing for the workaround + * availability took place in init_hyp_mode(). + */ + hyp_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + } +} + static void guest_lock_component(struct pkvm_hyp_vm *vm) { hyp_spin_lock(&vm->lock); @@ -574,8 +589,14 @@ static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size, ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, addr, size, &host_s2_pool, KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation); - if (!ret) + if (!ret) { + /* + * After stage2 maintenance has happened, but before the page + * owner has changed. + */ + pkvm_sme_dvmsync_fw_call(); __host_update_page_state(addr, size, PKVM_NOPAGE); + } return ret; } @@ -1369,6 +1390,22 @@ unlock: return ret && ret != -EHWPOISON ? ret : 0; } +/* + * share/donate install at most one stage-2 leaf (PAGE_SIZE, or one + * KVM_PGTABLE_LAST_LEVEL - 1 block for share). kvm_mmu_cache_min_pages() + * bounds the worst-case allocation: exact for the PAGE_SIZE leaf, + * conservative by one for the block. 
+ */ +static int __guest_check_pgtable_memcache(struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + if (vcpu->vcpu.arch.pkvm_memcache.nr_pages < kvm_mmu_cache_min_pages(vm->pgt.mmu)) + return -ENOMEM; + + return 0; +} + int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); @@ -1388,6 +1425,10 @@ int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) if (ret) goto unlock; + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + meta = host_stage2_encode_gfn_meta(vm, gfn); WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE, PKVM_ID_GUEST, meta)); @@ -1453,6 +1494,10 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu } } + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + for_each_hyp_page(page, phys, size) { set_host_state(page, PKVM_PAGE_SHARED_OWNED); page->host_share_guest_count++; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index e7496eb85628..eb1c10120f9f 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -752,16 +752,30 @@ static struct pkvm_hyp_vcpu selftest_vcpu = { struct pkvm_hyp_vcpu *init_selftest_vm(void *virt) { struct hyp_page *p = hyp_virt_to_page(virt); + unsigned long min_pages, seeded = 0; int i; selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + /* + * Mirror pkvm_refill_memcache() for the share/donate pre-checks; + * the selftest invokes those functions directly and would + * otherwise see an empty memcache. 
+ */ + min_pages = kvm_mmu_cache_min_pages(&selftest_vm.kvm.arch.mmu); + for (i = 0; i < pkvm_selftest_pages(); i++) { if (p[i].refcount) continue; p[i].refcount = 1; - hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + if (seeded < min_pages) { + push_hyp_memcache(&selftest_vcpu.vcpu.arch.pkvm_memcache, + hyp_page_to_virt(&p[i]), hyp_virt_to_phys); + seeded++; + } else { + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } } selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm(); diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c index a6ca27b18e15..e7e150ab265f 100644 --- a/arch/arm64/kvm/hyp/nvhe/trace.c +++ b/arch/arm64/kvm/hyp/nvhe/trace.c @@ -164,13 +164,16 @@ static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer, return ret; } -static bool hyp_trace_desc_validate(struct hyp_trace_desc *desc, size_t desc_size) +static bool hyp_trace_desc_is_valid(struct hyp_trace_desc *desc, size_t desc_size) { struct ring_buffer_desc *rb_desc; unsigned int cpu; size_t nr_bpages; void *desc_end; + if (!is_protected_kvm_enabled()) + return true; + /* * Both desc_size and bpages_backing_size are untrusted host-provided * values. We rely on __pkvm_host_donate_hyp() to enforce their validity. 
@@ -212,8 +215,10 @@ int __tracing_load(unsigned long desc_hva, size_t desc_size) if (ret) return ret; - if (!hyp_trace_desc_validate(desc, desc_size)) + if (!hyp_trace_desc_is_valid(desc, desc_size)) { + ret = -EINVAL; goto err_release_desc; + } hyp_spin_lock(&trace_buffer.lock); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 0c1defa5fb0f..91a7dfad6686 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -925,7 +925,9 @@ static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) static bool stage2_pte_executable(kvm_pte_t pte) { - return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); + enum kvm_pgtable_prot prot = kvm_pgtable_stage2_pte_prot(pte); + + return prot & (KVM_PGTABLE_PROT_UX | KVM_PGTABLE_PROT_PX); } static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 9db3f11a4754..1e8995add14f 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -663,7 +663,8 @@ static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; - __deactivate_traps(vcpu); + if (vcpu) + __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c index 8b7f2bf2fba8..c4b3ee552131 100644 --- a/arch/arm64/kvm/hyp_trace.c +++ b/arch/arm64/kvm/hyp_trace.c @@ -189,7 +189,7 @@ static void hyp_trace_buffer_unshare_hyp(struct hyp_trace_buffer *trace_buffer, if (cpu > last_cpu) break; - __share_page(rb_desc->meta_va); + __unshare_page(rb_desc->meta_va); for (p = 0; p < rb_desc->nr_page_va; p++) __unshare_page(rb_desc->page_va[p]); } @@ -212,14 +212,15 @@ static int hyp_trace_buffer_share_hyp(struct hyp_trace_buffer 
*trace_buffer) } if (ret) { - for (p--; p >= 0; p--) + while (--p >= 0) __unshare_page(rb_desc->page_va[p]); + __unshare_page(rb_desc->meta_va); break; } } if (ret) - hyp_trace_buffer_unshare_hyp(trace_buffer, cpu--); + hyp_trace_buffer_unshare_hyp(trace_buffer, --cpu); return ret; } @@ -248,6 +249,7 @@ static struct trace_buffer_desc *hyp_trace_load(unsigned long size, void *priv) goto err_free_desc; trace_buffer->desc = desc; + trace_buffer->desc_size = desc_size; ret = hyp_trace_buffer_alloc_bpages_backing(trace_buffer, size); if (ret) @@ -297,6 +299,7 @@ static void hyp_trace_unload(struct trace_buffer_desc *desc, void *priv) hyp_trace_buffer_free_bpages_backing(trace_buffer); free_pages_exact(trace_buffer->desc, trace_buffer->desc_size); trace_buffer->desc = NULL; + trace_buffer->desc_size = 0; } static int hyp_trace_enable_tracing(bool enable, void *priv) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d089c107d9b7..4da9281312eb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1576,21 +1576,24 @@ struct kvm_s2_fault_desc { static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) { bool write_fault, exec_fault; + bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt; unsigned long mmu_seq; struct page *page; struct kvm *kvm = s2fd->vcpu->kvm; - void *memcache; + void *memcache = NULL; kvm_pfn_t pfn; gfn_t gfn; int ret; - memcache = get_mmu_memcache(s2fd->vcpu); - ret = topup_mmu_memcache(s2fd->vcpu, memcache); - if (ret) - return ret; + if (!perm_fault) { + memcache = get_mmu_memcache(s2fd->vcpu); + ret = topup_mmu_memcache(s2fd->vcpu, memcache); + if (ret) + return ret; + } if (s2fd->nested) gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT; @@ -1631,9 +1634,19 @@ static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) goto out_unlock; } - ret = 
KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, - __pfn_to_phys(pfn), prot, - memcache, flags); + if (perm_fault) { + /* + * Drop the SW bits in favour of those stored in the + * PTE, which will be preserved. + */ + prot &= ~KVM_NV_GUEST_MAP_SZ; + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, s2fd->fault_ipa, + prot, flags); + } else { + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, + __pfn_to_phys(pfn), prot, + memcache, flags); + } out_unlock: kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 883b6c1008fb..6f7bc9a9992e 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -89,21 +89,28 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) * again, and there is no reason to affect the whole VM for this. */ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; - tmp = kvrealloc(kvm->arch.nested_mmus, - size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), - GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!tmp) - return -ENOMEM; - swap(kvm->arch.nested_mmus, tmp); + if (num_mmus > kvm->arch.nested_mmus_size) { + tmp = kvcalloc(num_mmus, sizeof(*tmp), GFP_KERNEL_ACCOUNT); + if (!tmp) + return -ENOMEM; - /* - * If we went through a realocation, adjust the MMU back-pointers in - * the previously initialised kvm_pgtable structures. 
- */ - if (kvm->arch.nested_mmus != tmp) - for (int i = 0; i < kvm->arch.nested_mmus_size; i++) - kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; + write_lock(&kvm->mmu_lock); + + if (kvm->arch.nested_mmus_size) { + memcpy(tmp, kvm->arch.nested_mmus, + size_mul(sizeof(*tmp), kvm->arch.nested_mmus_size)); + + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) + tmp[i].pgt->mmu = &tmp[i]; + } + + swap(kvm->arch.nested_mmus, tmp); + + write_unlock(&kvm->mmu_lock); + + kvfree(tmp); + } for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); @@ -1834,6 +1841,11 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) resx.res1 = VNCR_EL2_RES1; set_sysreg_masks(kvm, VNCR_EL2, resx); + /* ZCR_EL2 - bits 8:4 are RAZ/WI so treat them as RES0 */ + resx.res0 = ZCR_ELx_RES0 | GENMASK_ULL(8, 4); + resx.res1 = ZCR_ELx_RES1; + set_sysreg_masks(kvm, ZCR_EL2, resx); + out: for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) __vcpu_rmw_sys_reg(vcpu, sr, |=, 0); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e1860acae641..c816db5d6761 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -174,8 +174,8 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) * action is to use PMCR.P, which will reset them to * 0 (the only use of the 'force' parameter). 
*/ - val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32); - val |= lower_32_bits(val); + val = (__vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32)) | + lower_32_bits(val); } __vcpu_assign_sys_reg(vcpu, reg, val); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 148fc3400ea8..fa5c93c7a135 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2862,21 +2862,16 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - unsigned int vq; - if (guest_hyp_sve_traps_enabled(vcpu)) { kvm_inject_nested_sve_trap(vcpu); return false; } - if (!p->is_write) { + if (!p->is_write) p->regval = __vcpu_sys_reg(vcpu, ZCR_EL2); - return true; - } + else + __vcpu_assign_sys_reg(vcpu, ZCR_EL2, p->regval); - vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; - vq = min(vq, vcpu_sve_max_vq(vcpu)); - __vcpu_assign_sys_reg(vcpu, ZCR_EL2, vq - 1); return true; } diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 2ea9f1c7ebcd..1e3706ac3b8e 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -597,8 +597,10 @@ static void vgic_its_invalidate_cache(struct vgic_its *its) unsigned long idx; xa_for_each(&its->translation_cache, idx, irq) { - xa_erase(&its->translation_cache, idx); - vgic_put_irq(kvm, irq); + /* Only the context that erases the entry drops its cache ref. */ + irq = xa_erase(&its->translation_cache, idx); + if (irq) + vgic_put_irq(kvm, irq); } } @@ -2307,6 +2309,10 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id, /* dte entry is valid */ offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; + /* Mimic the MAPD behaviour and reject invalid EID bits. 
*/ + if (num_eventid_bits > VITS_TYPER_IDBITS) + return -EINVAL; + if (!vgic_its_check_id(its, baser, id, NULL)) return -EINVAL; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f3c5c7ca054..739800835920 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -1018,7 +1018,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, return vma_alloc_folio(flags, 0, vma, vaddr); } -bool tag_clear_highpages(struct page *page, int numpages) +bool tag_clear_highpages(struct page *page, int numpages, bool clear_pages) { /* * Check if MTE is supported and fall back to clear_highpage(). @@ -1026,13 +1026,16 @@ bool tag_clear_highpages(struct page *page, int numpages) * post_alloc_hook() will invoke tag_clear_highpages(). */ if (!system_supports_mte()) - return false; + return clear_pages; /* Newly allocated pages, shouldn't have been tagged yet */ for (int i = 0; i < numpages; i++, page++) { WARN_ON_ONCE(!try_page_mte_tagging(page)); - mte_zero_clear_page_tags(page_address(page)); + if (clear_pages) + mte_zero_clear_page_tags(page_address(page)); + else + mte_clear_page_tags(page_address(page)); set_page_mte_tagged(page); } - return true; + return false; } diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 3a5c7f6e5aac..7dca0c6cdc84 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += qrwlock.h generic-y += qrwlock_types.h generic-y += qspinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += user.h generic-y += vmlinux.lds.h generic-y += text-patching.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 1efa1e993d4b..0f887d4238ed 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -5,4 +5,5 @@ generic-y += extable.h generic-y += iomap.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git 
a/arch/loongarch/Kbuild b/arch/loongarch/Kbuild index beb8499dd8ed..1c7a0dbe5e72 100644 --- a/arch/loongarch/Kbuild +++ b/arch/loongarch/Kbuild @@ -3,7 +3,7 @@ obj-y += mm/ obj-y += net/ obj-y += vdso/ -obj-$(CONFIG_KVM) += kvm/ +obj-$(subst m,y,$(CONFIG_KVM)) += kvm/ # for cleaning subdir- += boot diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 3b042dbb2c41..606597da46b8 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -220,6 +220,7 @@ menu "Kernel type and options" choice prompt "Kernel type" + default 64BIT # Keep existing behavior config 32BIT bool "32-bit kernel" diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 47516aeea9d2..54fcfa1eac1f 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -55,9 +55,11 @@ endif ifdef CONFIG_32BIT tool-archpref = $(32bit-tool-archpref) UTS_MACHINE := loongarch32 +cflags-y += $(call cc-option,-m32) else tool-archpref = $(64bit-tool-archpref) UTS_MACHINE := loongarch64 +cflags-y += $(call cc-option,-m64) endif ifneq ($(SUBARCH),$(ARCH)) diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 9034b583a88a..7e92957baf6a 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -10,5 +10,6 @@ generic-y += qrwlock.h generic-y += user.h generic-y += ioctl.h generic-y += mmzone.h +generic-y += ring_buffer.h generic-y += statfs.h generic-y += text-patching.h diff --git a/arch/loongarch/include/asm/asm-prototypes.h b/arch/loongarch/include/asm/asm-prototypes.h index 704066b4f736..de0c17f3f49c 100644 --- a/arch/loongarch/include/asm/asm-prototypes.h +++ b/arch/loongarch/include/asm/asm-prototypes.h @@ -20,3 +20,23 @@ asmlinkage void noinstr __no_stack_protector ret_from_kernel_thread(struct task_ struct pt_regs *regs, int (*fn)(void *), void *fn_arg); + +struct kvm_run; +struct kvm_vcpu; +struct loongarch_fpu; + +void kvm_exc_entry(void); +int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu 
*vcpu); + +void kvm_save_fpu(struct loongarch_fpu *fpu); +void kvm_restore_fpu(struct loongarch_fpu *fpu); + +#ifdef CONFIG_CPU_HAS_LSX +void kvm_save_lsx(struct loongarch_fpu *fpu); +void kvm_restore_lsx(struct loongarch_fpu *fpu); +#endif + +#ifdef CONFIG_CPU_HAS_LASX +void kvm_save_lasx(struct loongarch_fpu *fpu); +void kvm_restore_lasx(struct loongarch_fpu *fpu); +#endif diff --git a/arch/loongarch/include/asm/efi.h b/arch/loongarch/include/asm/efi.h index eddc8e79b3fa..1ad764b18c3e 100644 --- a/arch/loongarch/include/asm/efi.h +++ b/arch/loongarch/include/asm/efi.h @@ -30,6 +30,8 @@ static inline unsigned long efi_get_kimg_min_align(void) return SZ_2M; } -#define EFI_KIMG_PREFERRED_ADDRESS PHYSADDR(VMLINUX_LOAD_ADDRESS) +unsigned long efi_get_kimg_kaslr_address(void); + +#define EFI_KIMG_PREFERRED_ADDRESS efi_get_kimg_kaslr_address() #endif /* _ASM_LOONGARCH_EFI_H */ diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 130cedbb6b39..776bc487a705 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -87,7 +87,6 @@ struct kvm_context { struct kvm_world_switch { int (*exc_entry)(void); int (*enter_guest)(struct kvm_run *run, struct kvm_vcpu *vcpu); - unsigned long page_order; }; #define MAX_PGTABLE_LEVELS 4 @@ -359,8 +358,6 @@ void kvm_exc_entry(void); int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu); extern unsigned long vpid_mask; -extern const unsigned long kvm_exception_size; -extern const unsigned long kvm_enter_guest_size; extern struct kvm_world_switch *kvm_loongarch_ops; #define SW_GCSR (1 << 0) diff --git a/arch/loongarch/include/asm/linkage.h b/arch/loongarch/include/asm/linkage.h index a1bd6a3ee03a..ae937d1708b2 100644 --- a/arch/loongarch/include/asm/linkage.h +++ b/arch/loongarch/include/asm/linkage.h @@ -69,7 +69,7 @@ 9, 10, 11, 12, 13, 14, 15, 16, \ 17, 18, 19, 20, 21, 22, 23, 24, \ 25, 26, 27, 28, 29, 30, 31; \ - .cfi_offset \num, SC_REGS + \num 
* SZREG; \ + .cfi_offset \num, SC_REGS + \num * 8; \ .endr; \ \ nop; \ diff --git a/arch/loongarch/include/asm/paravirt.h b/arch/loongarch/include/asm/paravirt.h index 0111f0ad5f73..acae1c5e5f88 100644 --- a/arch/loongarch/include/asm/paravirt.h +++ b/arch/loongarch/include/asm/paravirt.h @@ -4,6 +4,12 @@ #ifdef CONFIG_PARAVIRT +#include <linux/jump_label.h> + +DECLARE_STATIC_KEY_FALSE(virt_preempt_key); +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); +DECLARE_PER_CPU(struct kvm_steal_time, steal_time); + int __init pv_ipi_init(void); int __init pv_time_init(void); int __init pv_spinlock_init(void); diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h index 0ee15b3b3937..fbfc6be82f26 100644 --- a/arch/loongarch/include/asm/qspinlock.h +++ b/arch/loongarch/include/asm/qspinlock.h @@ -3,12 +3,9 @@ #define _ASM_LOONGARCH_QSPINLOCK_H #include <asm/kvm_para.h> -#include <linux/jump_label.h> +#include <asm/paravirt.h> #ifdef CONFIG_PARAVIRT -DECLARE_STATIC_KEY_FALSE(virt_preempt_key); -DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); -DECLARE_PER_CPU(struct kvm_steal_time, steal_time); #define virt_spin_lock virt_spin_lock diff --git a/arch/loongarch/include/asm/vdso/gettimeofday.h b/arch/loongarch/include/asm/vdso/gettimeofday.h index bae76767c693..18ba403e1ed9 100644 --- a/arch/loongarch/include/asm/vdso/gettimeofday.h +++ b/arch/loongarch/include/asm/vdso/gettimeofday.h @@ -85,12 +85,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return count; } -static inline bool loongarch_vdso_hres_capable(void) -{ - return true; -} -#define __arch_vdso_hres_capable loongarch_vdso_hres_capable - #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #endif /* !__ASSEMBLER__ */ diff --git a/arch/loongarch/kernel/kprobes.c b/arch/loongarch/kernel/kprobes.c index 8ba391cfabb0..1985ed30dd16 100644 --- a/arch/loongarch/kernel/kprobes.c +++ b/arch/loongarch/kernel/kprobes.c @@ -60,16 +60,18 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe); /* Install 
breakpoint in text */ void arch_arm_kprobe(struct kprobe *p) { - *p->addr = KPROBE_BP_INSN; - flush_insn_slot(p); + u32 insn = KPROBE_BP_INSN; + + larch_insn_text_copy(p->addr, &insn, LOONGARCH_INSN_SIZE); } NOKPROBE_SYMBOL(arch_arm_kprobe); /* Remove breakpoint from text */ void arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_insn_slot(p); + u32 insn = p->opcode; + + larch_insn_text_copy(p->addr, &insn, LOONGARCH_INSN_SIZE); } NOKPROBE_SYMBOL(arch_disarm_kprobe); @@ -184,16 +186,16 @@ static bool reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { switch (kcb->kprobe_status) { - case KPROBE_HIT_SS: case KPROBE_HIT_SSDONE: case KPROBE_HIT_ACTIVE: kprobes_inc_nmissed_count(p); setup_singlestep(p, regs, kcb, 1); break; + case KPROBE_HIT_SS: case KPROBE_REENTER: pr_warn("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); - WARN_ON_ONCE(1); + BUG(); break; default: WARN_ON(1); diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c index 16f6a9b39659..4b61a9632a98 100644 --- a/arch/loongarch/kernel/relocate.c +++ b/arch/loongarch/kernel/relocate.c @@ -134,11 +134,23 @@ early_param("nokaslr", nokaslr); #define KASLR_DISABLED_MESSAGE "KASLR is disabled by %s in %s cmdline.\n" +/* + * Note: strictly-defined KASLR means the kernel's final runtime address + * has a random offset from the kernel's load address, which is implemented + * in relocate.c; broadly-defined KASLR means the kernel's final runtime + * address has a random offset from the kernel's link address (a.k.a. + * VMLINUX_LOAD_ADDRESS), which also includes the efistub implementation, + * kexec_file implementation and QEMU direct kernel boot. kaslr_disabled() + * returning true only means strictly-defined KASLR is disabled. + */ static inline __init bool kaslr_disabled(void) { char *str; const char *builtin_cmdline = CONFIG_CMDLINE; + if (kaslr_offset()) + return true; /* KASLR is performed during early boot. 
*/ + str = strstr(builtin_cmdline, "nokaslr"); if (str == builtin_cmdline || (str > builtin_cmdline && *(str - 1) == ' ')) { pr_info(KASLR_DISABLED_MESSAGE, "\'nokaslr\'", "built-in"); @@ -210,14 +222,52 @@ static inline void __init *determine_relocation_address(void) return RELOCATED_KASLR(destination); } +static unsigned long __init determine_initrd_address(unsigned long *size) +{ + unsigned long start = 0; + unsigned long key_length; + char *p, *endp, *key = "initrd="; + + key_length = strlen(key); + p = strstr(boot_command_line, key); + + if (!p) { + key = "initrdmem="; + key_length = strlen(key); + p = strstr(boot_command_line, key); + } + + if (p == boot_command_line || (p > boot_command_line && *(p - 1) == ' ')) { + p += key_length; + start = memparse(p, &endp); + if (*endp == ',') + *size = memparse(endp + 1, NULL); + } + + return start; +} + static inline int __init relocation_addr_valid(void *location_new) { + unsigned long kernel_start, kernel_size; + unsigned long initrd_start, initrd_size = 0; + if ((unsigned long)location_new & 0x00000ffff) return 0; /* Inappropriately aligned new location */ if ((unsigned long)location_new < (unsigned long)_end) return 0; /* New location overlaps original kernel */ + initrd_start = determine_initrd_address(&initrd_size); + if (initrd_start && initrd_size) { + kernel_start = PHYSADDR(location_new); + kernel_size = (unsigned long)_end - (unsigned long)_text; + + if (kernel_start < (initrd_start + initrd_size) && + initrd_start < (kernel_start + kernel_size)) + return 0; /* initrd/initramfs overlaps kernel */ + } + return 1; } #endif diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile index ae469edec99c..a4d044da3aa7 100644 --- a/arch/loongarch/kvm/Makefile +++ b/arch/loongarch/kvm/Makefile @@ -7,11 +7,12 @@ include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o +obj-y += switch.o + kvm-y += exit.o kvm-y += interrupt.o kvm-y += main.o kvm-y += mmu.o -kvm-y += switch.o kvm-y += timer.o 
kvm-y += tlb.o kvm-y += vcpu.o diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index da0ad89f2eb7..3b95cd0f989b 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -390,6 +390,7 @@ int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) run->mmio.len = 8; break; default: + ret = EMULATE_FAIL; break; } break; diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c index 32930959f7c2..a18c60dffbba 100644 --- a/arch/loongarch/kvm/interrupt.c +++ b/arch/loongarch/kvm/interrupt.c @@ -28,23 +28,29 @@ static unsigned int priority_to_irq[EXCCODE_INT_NUM] = { static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { unsigned int irq = 0; + unsigned long old, new; clear_bit(priority, &vcpu->arch.irq_pending); if (priority < EXCCODE_INT_NUM) irq = priority_to_irq[priority]; - if (kvm_guest_has_msgint(&vcpu->arch) && (priority == INT_AVEC)) { - dmsintc_inject_irq(vcpu); - set_gcsr_estat(irq); - return 1; - } - switch (priority) { + case INT_AVEC: + if (!kvm_guest_has_msgint(&vcpu->arch)) + break; + dmsintc_inject_irq(vcpu); + fallthrough; case INT_TI: case INT_IPI: case INT_SWI0: case INT_SWI1: + old = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL); set_gcsr_estat(irq); + new = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL); + + /* Inject TI if TVAL inverted */ + if (new > old) + set_gcsr_estat(CPU_TIMER); break; case INT_HWI0 ... 
INT_HWI7: @@ -61,22 +67,28 @@ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority) { unsigned int irq = 0; + unsigned long old, new; clear_bit(priority, &vcpu->arch.irq_clear); if (priority < EXCCODE_INT_NUM) irq = priority_to_irq[priority]; - if (kvm_guest_has_msgint(&vcpu->arch) && (priority == INT_AVEC)) { - clear_gcsr_estat(irq); - return 1; - } - switch (priority) { + case INT_AVEC: + if (!kvm_guest_has_msgint(&vcpu->arch)) + break; + fallthrough; case INT_TI: case INT_IPI: case INT_SWI0: case INT_SWI1: + old = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL); clear_gcsr_estat(irq); + new = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL); + + /* Inject TI if TVAL inverted */ + if (new > old) + set_gcsr_estat(CPU_TIMER); break; case INT_HWI0 ... INT_HWI7: diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 76ebff2faedd..f105a86143f5 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -348,8 +348,7 @@ void kvm_arch_disable_virtualization_cpu(void) static int kvm_loongarch_env_init(void) { - int cpu, order, ret; - void *addr; + int cpu, ret; struct kvm_context *context; vmcs = alloc_percpu(struct kvm_context); @@ -365,30 +364,8 @@ static int kvm_loongarch_env_init(void) return -ENOMEM; } - /* - * PGD register is shared between root kernel and kvm hypervisor. - * So world switch entry should be in DMW area rather than TLB area - * to avoid page fault reenter. - * - * In future if hardware pagetable walking is supported, we won't - * need to copy world switch code to DMW area. 
- */ - order = get_order(kvm_exception_size + kvm_enter_guest_size); - addr = (void *)__get_free_pages(GFP_KERNEL, order); - if (!addr) { - free_percpu(vmcs); - vmcs = NULL; - kfree(kvm_loongarch_ops); - kvm_loongarch_ops = NULL; - return -ENOMEM; - } - - memcpy(addr, kvm_exc_entry, kvm_exception_size); - memcpy(addr + kvm_exception_size, kvm_enter_guest, kvm_enter_guest_size); - flush_icache_range((unsigned long)addr, (unsigned long)addr + kvm_exception_size + kvm_enter_guest_size); - kvm_loongarch_ops->exc_entry = addr; - kvm_loongarch_ops->enter_guest = addr + kvm_exception_size; - kvm_loongarch_ops->page_order = order; + kvm_loongarch_ops->exc_entry = (void *)kvm_exc_entry; + kvm_loongarch_ops->enter_guest = (void *)kvm_enter_guest; vpid_mask = read_csr_gstat(); vpid_mask = (vpid_mask & CSR_GSTAT_GIDBIT) >> CSR_GSTAT_GIDBIT_SHIFT; @@ -428,16 +405,10 @@ static int kvm_loongarch_env_init(void) static void kvm_loongarch_env_exit(void) { - unsigned long addr; - if (vmcs) free_percpu(vmcs); if (kvm_loongarch_ops) { - if (kvm_loongarch_ops->exc_entry) { - addr = (unsigned long)kvm_loongarch_ops->exc_entry; - free_pages(addr, kvm_loongarch_ops->page_order); - } kfree(kvm_loongarch_ops); } diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index a7fa458e3360..e104897aa532 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -95,7 +95,7 @@ static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) else kvm->stat.pages--; - *pte = ctx->invalid_entry; + kvm_set_pte(pte, ctx->invalid_entry); return 1; } diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index f1768b7a6194..936e4ae3e408 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -4,9 +4,11 @@ */ #include <linux/linkage.h> +#include <linux/kvm_types.h> #include <asm/asm.h> #include <asm/asmmacro.h> #include <asm/loongarch.h> +#include <asm/page.h> #include <asm/regdef.h> #include <asm/unwind_hints.h> @@ -100,11 +102,16 
@@ * - is still in guest mode, such as pgd table/vmid registers etc, * - will fix with hw page walk enabled in future * load kvm_vcpu from reserved CSR KVM_VCPU_KS, and save a2 to KVM_TEMP_KS + * + * PGD register is shared between root kernel and kvm hypervisor. + * So world switch entry should be in DMW area rather than TLB area + * to avoid page fault re-enter. */ .text + .p2align PAGE_SHIFT .cfi_sections .debug_frame SYM_CODE_START(kvm_exc_entry) - UNWIND_HINT_UNDEFINED + UNWIND_HINT_END_OF_STACK csrwr a2, KVM_TEMP_KS csrrd a2, KVM_VCPU_KS addi.d a2, a2, KVM_VCPU_ARCH @@ -190,8 +197,8 @@ ret_to_host: kvm_restore_host_gpr a2 jr ra -SYM_INNER_LABEL(kvm_exc_entry_end, SYM_L_LOCAL) SYM_CODE_END(kvm_exc_entry) +EXPORT_SYMBOL_FOR_KVM(kvm_exc_entry) /* * int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -215,8 +222,8 @@ SYM_FUNC_START(kvm_enter_guest) /* Save kvm_vcpu to kscratch */ csrwr a1, KVM_VCPU_KS kvm_switch_to_guest -SYM_INNER_LABEL(kvm_enter_guest_end, SYM_L_LOCAL) SYM_FUNC_END(kvm_enter_guest) +EXPORT_SYMBOL_FOR_KVM(kvm_enter_guest) SYM_FUNC_START(kvm_save_fpu) fpu_save_csr a0 t1 @@ -224,6 +231,7 @@ SYM_FUNC_START(kvm_save_fpu) fpu_save_cc a0 t1 t2 jr ra SYM_FUNC_END(kvm_save_fpu) +EXPORT_SYMBOL_FOR_KVM(kvm_save_fpu) SYM_FUNC_START(kvm_restore_fpu) fpu_restore_double a0 t1 @@ -231,6 +239,7 @@ SYM_FUNC_START(kvm_restore_fpu) fpu_restore_cc a0 t1 t2 jr ra SYM_FUNC_END(kvm_restore_fpu) +EXPORT_SYMBOL_FOR_KVM(kvm_restore_fpu) #ifdef CONFIG_CPU_HAS_LSX SYM_FUNC_START(kvm_save_lsx) @@ -239,6 +248,7 @@ SYM_FUNC_START(kvm_save_lsx) lsx_save_data a0 t1 jr ra SYM_FUNC_END(kvm_save_lsx) +EXPORT_SYMBOL_FOR_KVM(kvm_save_lsx) SYM_FUNC_START(kvm_restore_lsx) lsx_restore_data a0 t1 @@ -246,6 +256,7 @@ SYM_FUNC_START(kvm_restore_lsx) fpu_restore_csr a0 t1 t2 jr ra SYM_FUNC_END(kvm_restore_lsx) +EXPORT_SYMBOL_FOR_KVM(kvm_restore_lsx) #endif #ifdef CONFIG_CPU_HAS_LASX @@ -255,6 +266,7 @@ SYM_FUNC_START(kvm_save_lasx) lasx_save_data a0 t1 jr ra 
SYM_FUNC_END(kvm_save_lasx) +EXPORT_SYMBOL_FOR_KVM(kvm_save_lasx) SYM_FUNC_START(kvm_restore_lasx) lasx_restore_data a0 t1 @@ -262,10 +274,8 @@ SYM_FUNC_START(kvm_restore_lasx) fpu_restore_csr a0 t1 t2 jr ra SYM_FUNC_END(kvm_restore_lasx) +EXPORT_SYMBOL_FOR_KVM(kvm_restore_lasx) #endif - .section ".rodata" -SYM_DATA(kvm_exception_size, .quad kvm_exc_entry_end - kvm_exc_entry) -SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) #ifdef CONFIG_CPU_HAS_LBT STACK_FRAME_NON_STANDARD kvm_restore_fpu diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index 29c2aaba63c3..8356fce0043f 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -96,15 +96,21 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) * and set CSR TVAL with -1 */ write_gcsr_timertick(0); - __delay(2); /* Wait cycles until timer interrupt injected */ /* * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear * timer interrupt, and CSR TVAL keeps unchanged with -1, it * avoids spurious timer interrupt */ - if (!(estat & CPU_TIMER)) + if (!(estat & CPU_TIMER)) { + __delay(2); /* Wait cycles until timer interrupt injected */ + + /* Write TVAL with max value if no TI shot */ + estat = kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT); + if (!(estat & CPU_TIMER)) + write_gcsr_timertick(CSR_TCFG_VAL); gcsr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + } return; } diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index 8cc5ee1c53ef..1317c718f896 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -125,7 +125,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 3f9ab54114c5..031b39eb081c 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -123,11 +123,7 @@ 
void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn); - /* With altmap the first mapped page is offset from @start */ - if (altmap) - page += vmem_altmap_offset(altmap); __remove_pages(start_pfn, nr_pages, altmap); } #endif diff --git a/arch/loongarch/pci/acpi.c b/arch/loongarch/pci/acpi.c index 0dde3ddcd544..b02698a338ee 100644 --- a/arch/loongarch/pci/acpi.c +++ b/arch/loongarch/pci/acpi.c @@ -61,11 +61,16 @@ static void acpi_release_root_info(struct acpi_pci_root_info *ci) static int acpi_prepare_root_resources(struct acpi_pci_root_info *ci) { int status; + unsigned long long pci_h = 0; struct resource_entry *entry, *tmp; struct acpi_device *device = ci->bridge; status = acpi_pci_probe_root_resources(ci); if (status > 0) { + acpi_evaluate_integer(device->handle, "PCIH", NULL, &pci_h); + if (pci_h) + return status; + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { if (entry->res->flags & IORESOURCE_MEM) { entry->offset = ci->root->mcfg_addr & GENMASK_ULL(63, 40); diff --git a/arch/loongarch/pci/pci.c b/arch/loongarch/pci/pci.c index d233ea2218fe..f33c7ea1443d 100644 --- a/arch/loongarch/pci/pci.c +++ b/arch/loongarch/pci/pci.c @@ -132,6 +132,9 @@ static void loongson_gpu_fixup_dma_hang(struct pci_dev *pdev, bool on) crtc_reg = regbase; crtc_offset = 0x400; break; + default: + iounmap(regbase); + return; } for (i = 0; i < CRTC_NUM_MAX; i++, crtc_reg += crtc_offset) { diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index 42aa96249828..9c9181bb4071 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -12,6 +12,8 @@ obj-vdso-$(CONFIG_GENERIC_GETTIMEOFDAY) += vgettimeofday.o ccflags-vdso := \ $(filter -I%,$(KBUILD_CFLAGS)) \ $(filter -E%,$(KBUILD_CFLAGS)) \ + $(filter -m32,$(KBUILD_CFLAGS)) \ + $(filter -m64,$(KBUILD_CFLAGS)) \ $(filter 
-march=%,$(KBUILD_CFLAGS)) \ $(filter -m%-float,$(KBUILD_CFLAGS)) \ $(CLANG_FLAGS) \ diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index b282e0dd8dc1..62543bf305ff 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -3,5 +3,6 @@ generated-y += syscall_table.h generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += spinlock.h generic-y += text-patching.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 7178f990e8b3..0030309b47ad 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += syscalls.h generic-y += tlb.h generic-y += user.h diff --git a/arch/mips/dec/platform.c b/arch/mips/dec/platform.c index c4fcb8c58e01..723ce16cbfc0 100644 --- a/arch/mips/dec/platform.c +++ b/arch/mips/dec/platform.c @@ -10,6 +10,14 @@ #include <linux/mc146818rtc.h> #include <linux/platform_device.h> +#include <asm/bootinfo.h> + +#include <asm/dec/interrupts.h> +#include <asm/dec/ioasic_addrs.h> +#include <asm/dec/kn01.h> +#include <asm/dec/kn02.h> +#include <asm/dec/system.h> + static struct resource dec_rtc_resources[] = { { .name = "rtc", @@ -30,11 +38,110 @@ static struct platform_device dec_rtc_device = { .num_resources = ARRAY_SIZE(dec_rtc_resources), }; +static struct resource dec_dz_resources[] = { + { .name = "dz", .flags = IORESOURCE_MEM, }, + { .name = "dz", .flags = IORESOURCE_IRQ, }, +}; + +static struct platform_device dec_dz_device = { + .name = "dz", + .id = PLATFORM_DEVID_NONE, + .resource = dec_dz_resources, + .num_resources = ARRAY_SIZE(dec_dz_resources), +}; + +static struct platform_device *dec_dz_devices[] __initdata = { + &dec_dz_device, +}; + +static struct resource dec_zs_resources[][2] = { + { + { .name = "scc0", .flags = 
IORESOURCE_MEM, }, + { .name = "scc0", .flags = IORESOURCE_IRQ, }, + }, + { + { .name = "scc1", .flags = IORESOURCE_MEM, }, + { .name = "scc1", .flags = IORESOURCE_IRQ, }, + }, +}; + +static struct platform_device dec_zs_device[] = { + { + .name = "zs", + .id = 0, + .resource = dec_zs_resources[0], + .num_resources = ARRAY_SIZE(dec_zs_resources[0]), + }, + { + .name = "zs", + .id = 1, + .resource = dec_zs_resources[1], + .num_resources = ARRAY_SIZE(dec_zs_resources[1]), + }, +}; + static int __init dec_add_devices(void) { + struct platform_device *dec_zs_devices[ARRAY_SIZE(dec_zs_device)]; + int ret1, ret2, ret3; + int num_dz, num_zs; + int irq, i; + dec_rtc_resources[0].start = RTC_PORT(0); dec_rtc_resources[0].end = RTC_PORT(0) + dec_kn_slot_size - 1; - return platform_device_register(&dec_rtc_device); + + i = 0; + irq = dec_interrupt[DEC_IRQ_DZ11]; + if (IS_ENABLED(CONFIG_32BIT) && irq >= 0) { + resource_size_t base; + + switch (mips_machtype) { + case MACH_DS23100: + case MACH_DS5100: + base = dec_kn_slot_base + KN01_DZ11; + break; + default: + base = dec_kn_slot_base + KN02_DZ11; + break; + } + dec_dz_device.resource[0].start = base; + dec_dz_device.resource[0].end = base + dec_kn_slot_size - 1; + dec_dz_device.resource[1].start = irq; + dec_dz_device.resource[1].end = irq; + i++; + } + num_dz = i; + + i = 0; + irq = dec_interrupt[DEC_IRQ_SCC0]; + if (irq >= 0) { + resource_size_t base = dec_kn_slot_base + IOASIC_SCC0; + + dec_zs_device[i].resource[0].start = base; + dec_zs_device[i].resource[0].end = base + dec_kn_slot_size - 1; + dec_zs_device[i].resource[1].start = irq; + dec_zs_device[i].resource[1].end = irq; + dec_zs_devices[i] = &dec_zs_device[i]; + i++; + } + irq = dec_interrupt[DEC_IRQ_SCC1]; + if (irq >= 0) { + resource_size_t base = dec_kn_slot_base + IOASIC_SCC1; + + dec_zs_device[i].resource[0].start = base; + dec_zs_device[i].resource[0].end = base + dec_kn_slot_size - 1; + dec_zs_device[i].resource[1].start = irq; + 
dec_zs_device[i].resource[1].end = irq; + dec_zs_devices[i] = &dec_zs_device[i]; + i++; + } + num_zs = i; + + ret1 = platform_device_register(&dec_rtc_device); + ret2 = IS_ENABLED(CONFIG_32BIT) ? + platform_add_devices(dec_dz_devices, num_dz) : 0; + ret3 = platform_add_devices(dec_zs_devices, num_zs); + return ret1 ? ret1 : ret2 ? ret2 : ret3; } device_initcall(dec_add_devices); diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 684569b2ecd6..9771c3d85074 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -12,5 +12,6 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 28004301c236..0a2530964413 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += cmpxchg.h generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += spinlock.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/nios2/include/asm/linkage.h b/arch/nios2/include/asm/linkage.h index 211302301a8a..c4073235852b 100644 --- a/arch/nios2/include/asm/linkage.h +++ b/arch/nios2/include/asm/linkage.h @@ -12,4 +12,6 @@ #define __ALIGN .align 4 #define __ALIGN_STR ".align 4" +#define _THIS_IP_ ({ unsigned long __ip; asm volatile("nextpc %0" : "=r" (__ip)); __ip; }) + #endif diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index cef49d60d74c..8aa34621702d 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -8,4 +8,5 @@ generic-y += spinlock_types.h generic-y += spinlock.h generic-y += qrwlock_types.h generic-y += qrwlock.h +generic-y += ring_buffer.h generic-y += user.h diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index edab2a948352..4391783521bd 
100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -174,15 +174,21 @@ ifeq ($(KBUILD_EXTMOD),) # this hack. prepare: vdso_prepare vdso_prepare: prepare0 - $(if $(CONFIG_64BIT),$(Q)$(MAKE) \ - $(build)=arch/parisc/kernel/vdso64 include/generated/vdso64-offsets.h) - $(if $(CONFIG_PA11)$(CONFIG_COMPAT),$(Q)$(MAKE) \ +ifdef CONFIG_64BIT + $(Q)$(MAKE) $(build)=arch/parisc/kernel/vdso64 include/generated/vdso64-offsets.h + $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ $(build)=arch/parisc/kernel/vdso32 include/generated/vdso32-offsets.h) +else + $(Q)$(MAKE) $(build)=arch/parisc/kernel/vdso32 include/generated/vdso32-offsets.h +endif endif -vdso-install-$(CONFIG_PA11) += arch/parisc/kernel/vdso32/vdso32.so +ifdef CONFIG_64BIT +vdso-install-y += arch/parisc/kernel/vdso64/vdso64.so vdso-install-$(CONFIG_COMPAT) += arch/parisc/kernel/vdso32/vdso32.so -vdso-install-$(CONFIG_64BIT) += arch/parisc/kernel/vdso64/vdso64.so +else +vdso-install-y += arch/parisc/kernel/vdso32/vdso32.so +endif install: KBUILD_IMAGE := vmlinux zinstall: KBUILD_IMAGE := vmlinuz diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 4fb596d94c89..d48d158f7241 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -4,4 +4,5 @@ generated-y += syscall_table_64.h generic-y += agp.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += user.h diff --git a/arch/parisc/include/asm/vdso.h b/arch/parisc/include/asm/vdso.h index 5501560f5ffe..e5cca3c9c8e7 100644 --- a/arch/parisc/include/asm/vdso.h +++ b/arch/parisc/include/asm/vdso.h @@ -6,13 +6,14 @@ #ifdef CONFIG_64BIT #include <generated/vdso64-offsets.h> +#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) #include <generated/vdso32-offsets.h> -#endif - -#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) #define 
VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) +#else +#define VDSO32_SYMBOL(tsk, name) 0UL +#endif #endif /* __ASSEMBLER__ */ diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index 2f3441769ac5..49f937c2abbe 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -46,6 +46,9 @@ obj-$(CONFIG_KEXEC_FILE) += kexec_file.o # vdso obj-y += vdso.o -obj-$(CONFIG_64BIT) += vdso64/ -obj-$(CONFIG_PA11) += vdso32/ +ifdef CONFIG_64BIT +obj-y += vdso64/ obj-$(CONFIG_COMPAT) += vdso32/ +else +obj-y += vdso32/ +endif diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index bc47bbe3026e..b52ad704ec8a 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -41,9 +41,7 @@ const struct dma_map_ops *hppa_dma_ops __ro_after_init; EXPORT_SYMBOL(hppa_dma_ops); -static struct device root = { - .init_name = "parisc", -}; +static struct device *root; static inline int check_dev(struct device *dev) { @@ -89,7 +87,7 @@ static int for_each_padev(int (*fn)(struct device *, void *), void * data) .obj = data, .fn = fn, }; - return device_for_each_child(&root, &recurse_data, descend_children); + return device_for_each_child(root, &recurse_data, descend_children); } /** @@ -290,7 +288,7 @@ const struct parisc_device * find_pa_parent_type(const struct parisc_device *padev, int type) { const struct device *dev = &padev->dev; - while (dev != &root) { + while (dev != root) { struct parisc_device *candidate = to_parisc_device(dev); if (candidate->id.hw_type == type) return candidate; @@ -319,7 +317,7 @@ static void get_node_path(struct device *dev, struct hardware_path *path) dev = dev->parent; } - while (dev != &root) { + while (dev != root) { if (dev_is_pci(dev)) { unsigned int devfn = to_pci_dev(dev)->devfn; path->bc[i--] = PCI_SLOT(devfn) | (PCI_FUNC(devfn)<< 5); @@ -482,7 +480,7 @@ static struct parisc_device * __init alloc_tree_node( static struct parisc_device 
*create_parisc_device(struct hardware_path *modpath) { int i; - struct device *parent = &root; + struct device *parent = root; for (i = 0; i < 6; i++) { if (modpath->bc[i] == -1) continue; @@ -755,7 +753,7 @@ parse_tree_node(struct device *parent, int index, struct hardware_path *modpath) struct device *hwpath_to_device(struct hardware_path *modpath) { int i; - struct device *parent = &root; + struct device *parent = root; for (i = 0; i < 6; i++) { if (modpath->bc[i] == -1) continue; @@ -880,7 +878,7 @@ void __init walk_central_bus(void) { walk_native_bus(CENTRAL_BUS_ADDR, CENTRAL_BUS_ADDR + (MAX_NATIVE_DEVICES * NATIVE_DEVICE_OFFSET), - &root); + root); } static __init void print_parisc_device(struct parisc_device *dev) @@ -907,9 +905,10 @@ void __init init_parisc_bus(void) { if (bus_register(&parisc_bus_type)) panic("Could not register PA-RISC bus type\n"); - if (device_register(&root)) + + root = root_device_register("parisc"); + if (IS_ERR(root)) panic("Could not register PA-RISC root device\n"); - get_device(&root); } static __init void qemu_header(void) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index f15e5920080b..e8718bc13eeb 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -83,11 +83,10 @@ config MSI_BITMAP_SELFTEST depends on DEBUG_KERNEL config GUEST_STATE_BUFFER_TEST - def_tristate n + def_tristate KUNIT_ALL_TESTS prompt "Enable Guest State Buffer unit tests" depends on KUNIT depends on KVM_BOOK3S_HV_POSSIBLE - default KUNIT_ALL_TESTS help The Guest State Buffer is a data format specified in the PAPR. 
It is by hcalls to communicate the state of L2 guests between diff --git a/arch/powerpc/configs/amigaone_defconfig b/arch/powerpc/configs/amigaone_defconfig index 69ef3dc31c4b..7a515390646b 100644 --- a/arch/powerpc/configs/amigaone_defconfig +++ b/arch/powerpc/configs/amigaone_defconfig @@ -76,7 +76,6 @@ CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set # CONFIG_HWMON is not set CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_RADEON=y CONFIG_FB_3DFX=y diff --git a/arch/powerpc/configs/chrp32_defconfig b/arch/powerpc/configs/chrp32_defconfig index b799c95480ae..66eae5b7e16c 100644 --- a/arch/powerpc/configs/chrp32_defconfig +++ b/arch/powerpc/configs/chrp32_defconfig @@ -76,7 +76,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_NVRAM=y # CONFIG_HWMON is not set CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y CONFIG_FB_MATROX=y CONFIG_FB_MATROX_MILLENIUM=y diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index 04bbb37f5978..5ca1676e6058 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -85,6 +85,8 @@ CONFIG_PMAC_SMU=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_WINDFARM=y CONFIG_WINDFARM_PM81=y +CONFIG_WINDFARM_PM72=y +CONFIG_WINDFARM_RM31=y CONFIG_WINDFARM_PM91=y CONFIG_WINDFARM_PM112=y CONFIG_WINDFARM_PM121=y @@ -121,7 +123,6 @@ CONFIG_I2C_CHARDEV=y CONFIG_AGP=m CONFIG_AGP_UNINORTH=m CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_OF=y CONFIG_FB_NVIDIA=y diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig index 8bbf51b38480..89bcbeb05067 100644 --- a/arch/powerpc/configs/pasemi_defconfig +++ b/arch/powerpc/configs/pasemi_defconfig @@ -98,7 +98,6 @@ CONFIG_SENSORS_LM85=y CONFIG_SENSORS_LM90=y CONFIG_DRM=y CONFIG_DRM_RADEON=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_VGA16=y CONFIG_FB_NVIDIA=y diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 
cc9802420237..5d32c2767a65 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -196,7 +196,6 @@ CONFIG_I2C_CHARDEV=y # CONFIG_PTP_1588_CLOCK is not set CONFIG_DRM=y CONFIG_DRM_AST=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y CONFIG_FB_MATROX=m CONFIG_FB_MATROX_MILLENIUM=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 3bf518e3a573..6316ca4df25d 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -249,7 +249,6 @@ CONFIG_I2C_CHARDEV=y CONFIG_I2C_AMD8111=y CONFIG_I2C_PASEMI=y CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y CONFIG_FB_MATROX=y CONFIG_FB_MATROX_MILLENIUM=y diff --git a/arch/powerpc/configs/ppc64e_defconfig b/arch/powerpc/configs/ppc64e_defconfig index 0fd49f67331f..20cc17dce94d 100644 --- a/arch/powerpc/configs/ppc64e_defconfig +++ b/arch/powerpc/configs/ppc64e_defconfig @@ -118,7 +118,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_AMD8111=y CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y CONFIG_FB_MATROX=y CONFIG_FB_MATROX_MILLENIUM=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index ccabc6e17168..eda1fec7ffd9 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -393,6 +393,7 @@ CONFIG_NETCONSOLE=m CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m +CONFIG_EL3=m CONFIG_VORTEX=m CONFIG_TYPHOON=m CONFIG_ADAPTEC_STARFIRE=m diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index ff1bed4b6d2c..005536ee75bb 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -214,7 +214,6 @@ CONFIG_SENSORS_IBMPOWERNV=m CONFIG_DRM=m CONFIG_DRM_AST=m CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 
2e23533b67e3..805b5aeebb6f 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -5,4 +5,5 @@ generated-y += syscall_table_spu.h generic-y += agp.h generic-y += mcs_spinlock.h generic-y += qrwlock.h +generic-y += ring_buffer.h generic-y += early_ioremap.h diff --git a/arch/powerpc/include/asm/pmac_low_i2c.h b/arch/powerpc/include/asm/pmac_low_i2c.h index 21bd7297c87f..fead8fae08ab 100644 --- a/arch/powerpc/include/asm/pmac_low_i2c.h +++ b/arch/powerpc/include/asm/pmac_low_i2c.h @@ -79,10 +79,6 @@ extern int pmac_i2c_match_adapter(struct device_node *dev, struct i2c_adapter *adapter); -/* (legacy) Locking functions exposed to i2c-keywest */ -extern int pmac_low_i2c_lock(struct device_node *np); -extern int pmac_low_i2c_unlock(struct device_node *np); - /* Access functions for platform code */ extern int pmac_i2c_open(struct pmac_i2c_bus *bus, int polled); extern void pmac_i2c_close(struct pmac_i2c_bus *bus); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 4bbeb8644d3d..b4472288e0d4 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -458,6 +458,10 @@ DEFINE_PER_CPU(u8, irq_work_pending); #endif /* 32 vs 64 bit */ +/* + * Must be called with preemption disabled since it updates + * per-CPU irq_work state and programs the local CPU decrementer. + */ void arch_irq_work_raise(void) { /* @@ -471,10 +475,8 @@ void arch_irq_work_raise(void) * which could get tangled up if we're messing with the same state * here. */ - preempt_disable(); set_irq_work_pending_flag(); set_dec(1); - preempt_enable(); } static void set_dec_or_work(u64 val) diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index 8834dfe9d727..368759f81708 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -62,6 +62,12 @@ CC32FLAGSREMOVE += -fno-stack-clash-protection # 32-bit one. 
clang validates the values passed to these arguments during # parsing, even when -fno-stack-protector is passed afterwards. CC32FLAGSREMOVE += -mstack-protector-guard% +# ftrace is disabled for the vdso but arch/powerpc/Makefile adds this define to +# KBUILD_CPPFLAGS, which enables use of the 'patchable_function_entry' +# attribute in the 'inline' define via 'notrace'. This attribute is not +# supported for the powerpcle target, resulting in many instances of +# -Wunknown-attributes. +CC32FLAGSREMOVE += -DCC_USING_PATCHABLE_FUNCTION_ENTRY endif LD32FLAGS := -Wl,-soname=linux-vdso32.so.1 AS32FLAGS := -D__VDSO32__ diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile index 470eb0453e17..ec7a0eed75dc 100644 --- a/arch/powerpc/kexec/Makefile +++ b/arch/powerpc/kexec/Makefile @@ -16,4 +16,4 @@ GCOV_PROFILE_core_$(BITS).o := n KCOV_INSTRUMENT_core_$(BITS).o := n UBSAN_SANITIZE_core_$(BITS).o := n KASAN_SANITIZE_core.o := n -KASAN_SANITIZE_core_$(BITS) := n +KASAN_SANITIZE_core_$(BITS).o := n diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index 554b248002b4..57e897b60db8 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -52,7 +52,14 @@ int exit_vmx_usercopy(void) } EXPORT_SYMBOL(exit_vmx_usercopy); -int enter_vmx_ops(void) +/* + * Can be called from kexec copy_page() path with MMU off. The kexec + * code sets preempt_count to HARDIRQ_OFFSET so we return early here. + * Since in_interrupt() is always inline, __no_sanitize_address on this + * function is sufficient to avoid KASAN shadow memory accesses in real + * mode. 
+ */ +int __no_sanitize_address enter_vmx_ops(void) { if (in_interrupt()) return 0; diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 8b0081441f85..2e6adf5b95c4 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2242,6 +2242,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, const u64 last_period = event->hw.last_period; s64 prev, delta, left; int record = 0; + int mark_event = regs->dsisr & MMCRA_SAMPLE_ENABLE; if (event->hw.state & PERF_HES_STOPPED) { write_pmc(event->hw.idx, 0); @@ -2304,9 +2305,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val, * In ISA v3.0 and before values "0" and "7" are considered reserved. * In ISA v3.1, value "7" has been used to indicate "larx/stcx". * Drop the sample if "type" has reserved values for this field with a - * ISA version check. + * ISA version check for marked events. */ - if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && + if (mark_event && event->attr.sample_type & PERF_SAMPLE_DATA_SRC && ppmu->get_mem_data_src) { val = (regs->dar & SIER_TYPE_MASK) >> SIER_TYPE_SHIFT; if (val == 0 || (val == 7 && !cpu_has_feature(CPU_FTR_ARCH_31))) { diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 5cac2cf3bd1e..10c82cf8f5b3 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -210,7 +210,7 @@ static ssize_t processor_bus_topology_show(struct device *dev, struct device_att 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -244,12 +244,14 @@ static ssize_t processor_bus_topology_show(struct device *dev, struct device_att starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -278,7 +280,7 @@ static ssize_t processor_config_show(struct device *dev, struct device_attribute 0, 0, buf, &n, 
arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -312,12 +314,14 @@ static ssize_t processor_config_show(struct device *dev, struct device_attribute starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -346,7 +350,7 @@ static ssize_t affinity_domain_via_virtual_processor_show(struct device *dev, 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -382,12 +386,14 @@ static ssize_t affinity_domain_via_virtual_processor_show(struct device *dev, starting_index, secondary_index, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -416,7 +422,7 @@ static ssize_t affinity_domain_via_domain_show(struct device *dev, struct device 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -448,12 +454,14 @@ static ssize_t affinity_domain_via_domain_show(struct device *dev, struct device starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index a5001d32f978..6f674f86dc85 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -293,6 +293,8 @@ static int pika_dtm_thread(void __iomem *fpga) schedule_timeout(HZ); } + put_device(&client->dev); + return 0; } diff --git a/arch/powerpc/platforms/82xx/km82xx.c b/arch/powerpc/platforms/82xx/km82xx.c index 99f0f0f41876..4ad223525e89 100644 --- a/arch/powerpc/platforms/82xx/km82xx.c +++ b/arch/powerpc/platforms/82xx/km82xx.c @@ -27,8 +27,8 @@ static void __init km82xx_pic_init(void) { - struct device_node *np __free(device_node); - np = of_find_compatible_node(NULL, 
NULL, "fsl,pq2-pic"); + struct device_node *np __free(device_node) = of_find_compatible_node(NULL, + NULL, "fsl,pq2-pic"); if (!np) { pr_err("PIC init: can not find cpm-pic node\n"); diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c index 7433be7d66ee..f00734f0590c 100644 --- a/arch/powerpc/platforms/8xx/cpm1.c +++ b/arch/powerpc/platforms/8xx/cpm1.c @@ -477,7 +477,7 @@ int cpm1_gpiochip_add16(struct device *dev) struct device_node *np = dev->of_node; struct cpm1_gpio16_chip *cpm1_gc; struct gpio_chip *gc; - u16 mask; + u32 mask; cpm1_gc = devm_kzalloc(dev, sizeof(*cpm1_gc), GFP_KERNEL); if (!cpm1_gc) @@ -485,7 +485,7 @@ int cpm1_gpiochip_add16(struct device *dev) spin_lock_init(&cpm1_gc->lock); - if (!of_property_read_u16(np, "fsl,cpm1-gpio-irq-mask", &mask)) { + if (!of_property_read_u32(np, "fsl,cpm1-gpio-irq-mask", &mask)) { int i, j; for (i = 0, j = 0; i < 16; i++) diff --git a/arch/powerpc/platforms/pasemi/pci.c b/arch/powerpc/platforms/pasemi/pci.c index 60f990a336c4..2df955274652 100644 --- a/arch/powerpc/platforms/pasemi/pci.c +++ b/arch/powerpc/platforms/pasemi/pci.c @@ -272,13 +272,12 @@ void __init pas_pci_init(void) { struct device_node *root = of_find_node_by_path("/"); struct device_node *np; - int res; pci_set_flags(PCI_SCAN_ALL_PCIE_DEVS); np = of_find_compatible_node(root, NULL, "pasemi,rootbus"); if (np) { - res = pas_add_bridge(np); + pas_add_bridge(np); of_node_put(np); } of_node_put(root); diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 73b7f4e8c047..da72a30ab865 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -1058,40 +1058,6 @@ int pmac_i2c_match_adapter(struct device_node *dev, struct i2c_adapter *adapter) } EXPORT_SYMBOL_GPL(pmac_i2c_match_adapter); -int pmac_low_i2c_lock(struct device_node *np) -{ - struct pmac_i2c_bus *bus, *found = NULL; - - list_for_each_entry(bus, &pmac_i2c_busses, link) { 
- if (np == bus->controller) { - found = bus; - break; - } - } - if (!found) - return -ENODEV; - return pmac_i2c_open(bus, 0); -} -EXPORT_SYMBOL_GPL(pmac_low_i2c_lock); - -int pmac_low_i2c_unlock(struct device_node *np) -{ - struct pmac_i2c_bus *bus, *found = NULL; - - list_for_each_entry(bus, &pmac_i2c_busses, link) { - if (np == bus->controller) { - found = bus; - break; - } - } - if (!found) - return -ENODEV; - pmac_i2c_close(bus); - return 0; -} -EXPORT_SYMBOL_GPL(pmac_low_i2c_unlock); - - int pmac_i2c_open(struct pmac_i2c_bus *bus, int polled) { int rc; diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index 12c473768c39..9109c218a060 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -950,8 +950,6 @@ static int __init ps3_start_probe_thread(enum ps3_bus_type bus_type) static int __init ps3_register_devices(void) { - int result; - if (!firmware_has_feature(FW_FEATURE_PS3_LV1)) return -ENODEV; @@ -959,7 +957,7 @@ static int __init ps3_register_devices(void) /* ps3_repository_dump_bus_info(); */ - result = ps3_start_probe_thread(PS3_BUS_TYPE_STORAGE); + ps3_start_probe_thread(PS3_BUS_TYPE_STORAGE); ps3_register_vuart_devices(); diff --git a/arch/powerpc/platforms/pseries/htmdump.c b/arch/powerpc/platforms/pseries/htmdump.c index 742ec52c9d4d..489a80e87082 100644 --- a/arch/powerpc/platforms/pseries/htmdump.c +++ b/arch/powerpc/platforms/pseries/htmdump.c @@ -16,6 +16,7 @@ static void *htm_buf; static void *htm_status_buf; static void *htm_info_buf; static void *htm_caps_buf; +static void *htm_mem_buf; static u32 nodeindex; static u32 nodalchipindex; static u32 coreindexonchip; @@ -86,7 +87,7 @@ static ssize_t htm_return_check(long rc) static ssize_t htmdump_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - void *htm_buf = filp->private_data; + void *htm_buf_data = filp->private_data; unsigned long page, read_size, available; loff_t offset; 
long rc, ret; @@ -100,7 +101,7 @@ static ssize_t htmdump_read(struct file *filp, char __user *ubuf, * - last three values are address, size and offset */ rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, - htmtype, H_HTM_OP_DUMP_DATA, virt_to_phys(htm_buf), + htmtype, H_HTM_OP_DUMP_DATA, virt_to_phys(htm_buf_data), PAGE_SIZE, page); ret = htm_return_check(rc); @@ -112,7 +113,61 @@ static ssize_t htmdump_read(struct file *filp, char __user *ubuf, available = PAGE_SIZE; read_size = min(count, available); *ppos += read_size; - return simple_read_from_buffer(ubuf, count, &offset, htm_buf, available); + return simple_read_from_buffer(ubuf, count, &offset, htm_buf_data, available); +} + +static ssize_t htmsystem_mem_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + void *htm_mem_data = filp->private_data; + long rc, ret; + u64 *num_entries; + u64 to_copy = 0; + loff_t offset = 0; + u64 mem_offset = 0; + + /* + * Invoke H_HTM call with: + * - operation as htm status (H_HTM_OP_STATUS) + * - last three values as addr, size and offset. "offset" + * is value from output buffer header that points to next + * entry to dump. 0 is the first entry to dump. next entry + * is read from the output bufferbyte offset 0x8. + * + * When first time hcall is invoked, mem_offset should be + * zero because zero is the first entry. + * In the next hcall, offset of next entry to read from is + * picked from output buffer header itself. So don't fill + * mem_offset for first read. + * + * If there is no further data to read in next iteration, + * offset value from output buffer header will point to -1. 
+ */ + if (*ppos) { + mem_offset = *(u64 *)(htm_mem_data + 0x8); + if (mem_offset == -1) + return 0; + } + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_DUMP_SYSMEM_CONF, virt_to_phys(htm_mem_data), + PAGE_SIZE, be64_to_cpu(mem_offset)); + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall returned for op: H_HTM_OP_DUMP_SYSMEM_CONF with hcall returning %ld\n", ret); + return ret; + } + + /* + * HTM system mem buffer, start of buffer + 0x10 gives the + * number of HTM entries in the buffer. + * So total count to copy is: + * 32 bytes (for first 5 fields) + (number of HTM entries * entry size) + */ + num_entries = htm_mem_data + 0x10; + to_copy = 32 + (be64_to_cpu(*num_entries) * 32); + + *ppos += to_copy; + return simple_read_from_buffer(ubuf, count, &offset, htm_mem_data, to_copy); } static const struct file_operations htmdump_fops = { @@ -121,6 +176,12 @@ static const struct file_operations htmdump_fops = { .open = simple_open, }; +static const struct file_operations htmsystem_mem_fops = { + .llseek = NULL, + .read = htmsystem_mem_read, + .open = simple_open, +}; + static int htmconfigure_set(void *data, u64 val) { long rc, ret; @@ -226,20 +287,31 @@ static int htmstart_get(void *data, u64 *val) static ssize_t htmstatus_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - void *htm_status_buf = filp->private_data; + void *htm_status_data = filp->private_data; long rc, ret; u64 *num_entries; u64 to_copy; int htmstatus_flag; + loff_t offset = 0; + u64 status_offset = 0; /* * Invoke H_HTM call with: * - operation as htm status (H_HTM_OP_STATUS) - * - last three values as addr, size and offset + * - last three values as addr, size and offset. + * "offset" is value from output buffer header + * that points to next entry to dump. 0 is the first + * entry to dump. next entry is read from the output + * bufferbyte offset 0x8. 
*/ + if (*ppos) { + status_offset = *(u64 *)(htm_status_data + 0x8); + if (status_offset == -1) + return 0; + } rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, - htmtype, H_HTM_OP_STATUS, virt_to_phys(htm_status_buf), - PAGE_SIZE, 0); + htmtype, H_HTM_OP_STATUS, virt_to_phys(htm_status_data), + PAGE_SIZE, be64_to_cpu(status_offset)); ret = htm_return_check(rc); if (ret <= 0) { @@ -255,13 +327,15 @@ static ssize_t htmstatus_read(struct file *filp, char __user *ubuf, * So total count to copy is: * 32 bytes (for first 7 fields) + (number of HTM entries * entry size) */ - num_entries = htm_status_buf + 0x10; + num_entries = htm_status_data + 0x10; if (htmtype == 0x2) htmstatus_flag = 0x8; else htmstatus_flag = 0x6; to_copy = 32 + (be64_to_cpu(*num_entries) * htmstatus_flag); - return simple_read_from_buffer(ubuf, count, ppos, htm_status_buf, to_copy); + *ppos += to_copy; + + return simple_read_from_buffer(ubuf, count, &offset, htm_status_data, to_copy); } static const struct file_operations htmstatus_fops = { @@ -273,19 +347,30 @@ static const struct file_operations htmstatus_fops = { static ssize_t htminfo_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - void *htm_info_buf = filp->private_data; + void *htm_info_data = filp->private_data; long rc, ret; u64 *num_entries; u64 to_copy; + loff_t offset = 0; + u64 info_offset = 0; /* * Invoke H_HTM call with: * - operation as htm status (H_HTM_OP_STATUS) * - last three values as addr, size and offset + * "offset" is value from output buffer header + * that points to next entry to dump. 0 is the first + * entry to dump. next entry is read from the output + * bufferbyte offset 0x8. 
*/ + if (*ppos) { + info_offset = *(u64 *)(htm_info_data + 0x8); + if (info_offset == -1) + return 0; + } rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, - htmtype, H_HTM_OP_DUMP_SYSPROC_CONF, virt_to_phys(htm_info_buf), - PAGE_SIZE, 0); + htmtype, H_HTM_OP_DUMP_SYSPROC_CONF, virt_to_phys(htm_info_data), + PAGE_SIZE, be64_to_cpu(info_offset)); ret = htm_return_check(rc); if (ret <= 0) { @@ -301,15 +386,17 @@ static ssize_t htminfo_read(struct file *filp, char __user *ubuf, * So total count to copy is: * 32 bytes (for first 5 fields) + (number of HTM entries * entry size) */ - num_entries = htm_info_buf + 0x10; + num_entries = htm_info_data + 0x10; to_copy = 32 + (be64_to_cpu(*num_entries) * 16); - return simple_read_from_buffer(ubuf, count, ppos, htm_info_buf, to_copy); + + *ppos += to_copy; + return simple_read_from_buffer(ubuf, count, &offset, htm_info_data, to_copy); } static ssize_t htmcaps_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - void *htm_caps_buf = filp->private_data; + void *htm_caps_data = filp->private_data; long rc, ret; /* @@ -319,7 +406,7 @@ static ssize_t htmcaps_read(struct file *filp, char __user *ubuf, * and zero */ rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, - htmtype, H_HTM_OP_CAPABILITIES, virt_to_phys(htm_caps_buf), + htmtype, H_HTM_OP_CAPABILITIES, virt_to_phys(htm_caps_data), 0x80, 0); ret = htm_return_check(rc); @@ -328,7 +415,7 @@ static ssize_t htmcaps_read(struct file *filp, char __user *ubuf, return ret; } - return simple_read_from_buffer(ubuf, count, ppos, htm_caps_buf, 0x80); + return simple_read_from_buffer(ubuf, count, ppos, htm_caps_data, 0x80); } static const struct file_operations htminfo_fops = { @@ -457,9 +544,17 @@ static int htmdump_init_debugfs(void) return -ENOMEM; } + /* Memory to present HTM system memory configuration */ + htm_mem_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!htm_mem_buf) { + pr_err("Failed to allocate htm mem 
buf\n"); + return -ENOMEM; + } + debugfs_create_file("htmstatus", 0400, htmdump_debugfs_dir, htm_status_buf, &htmstatus_fops); debugfs_create_file("htminfo", 0400, htmdump_debugfs_dir, htm_info_buf, &htminfo_fops); debugfs_create_file("htmcaps", 0400, htmdump_debugfs_dir, htm_caps_buf, &htmcaps_fops); + debugfs_create_file("htmsystem_mem", 0400, htmdump_debugfs_dir, htm_mem_buf, &htmsystem_mem_fops); return 0; } @@ -482,6 +577,10 @@ static void __exit htmdump_exit(void) { debugfs_remove_recursive(htmdump_debugfs_dir); kfree(htm_buf); + kfree(htm_status_buf); + kfree(htm_info_buf); + kfree(htm_caps_buf); + kfree(htm_mem_buf); } module_init(htmdump_init); diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 14ae480d060a..0c40bdde45e2 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -190,33 +190,34 @@ static int hvpipe_rtas_recv_msg(char __user *buf, int size) return -ENOMEM; } - ret = rtas_ibm_receive_hvpipe_msg(work_area, &srcID, - &bytes_written); - if (!ret) { - /* - * Recv HVPIPE RTAS is successful. - * When releasing FD or no one is waiting on the - * specific source, issue recv HVPIPE RTAS call - * so that pipe is not blocked - this func is called - * with NULL buf. - */ - if (buf) { - if (size < bytes_written) { - pr_err("Received the payload size = %d, but the buffer size = %d\n", - bytes_written, size); - bytes_written = size; - } - ret = copy_to_user(buf, - rtas_work_area_raw_buf(work_area), - bytes_written); - if (!ret) - ret = bytes_written; - } - } else { - pr_err("ibm,receive-hvpipe-msg failed with %d\n", - ret); + /* + * Recv HVPIPE RTAS is successful. + * When releasing FD or no one is waiting on the + * specific source, issue recv HVPIPE RTAS call + * so that pipe is not blocked - this func is called + * with NULL buf. 
+ */ + ret = rtas_ibm_receive_hvpipe_msg(work_area, &srcID, &bytes_written); + if (ret) { + pr_err("ibm,receive-hvpipe-msg failed with %d\n", ret); + goto out; } + if (!buf) + goto out; + + if (size < bytes_written) { + pr_err("Received the payload size = %d, but the buffer size = %d\n", + bytes_written, size); + bytes_written = size; + } + + if (copy_to_user(buf, rtas_work_area_raw_buf(work_area), bytes_written)) + ret = -EFAULT; + else + ret = bytes_written; + +out: rtas_work_area_free(work_area); return ret; } @@ -327,8 +328,8 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, { struct hvpipe_source_info *src_info = file->private_data; - struct papr_hvpipe_hdr hdr; - long ret; + struct papr_hvpipe_hdr hdr = {}; + ssize_t ret = 0; /* * Return -ENXIO during migration @@ -376,7 +377,7 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, ret = copy_to_user(buf, &hdr, HVPIPE_HDR_LEN); if (ret) - return ret; + return -EFAULT; /* * Message event has payload, so get the payload with @@ -385,19 +386,23 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, if (hdr.flags & HVPIPE_MSG_AVAILABLE) { ret = hvpipe_rtas_recv_msg(buf + HVPIPE_HDR_LEN, size - HVPIPE_HDR_LEN); - if (ret > 0) { + /* + * Always clear MSG_AVAILABLE once the RTAS call has drained + * the message, regardless of whether copy_to_user succeeded. + */ + if (ret >= 0 || ret == -EFAULT) src_info->hvpipe_status &= ~HVPIPE_MSG_AVAILABLE; - ret += HVPIPE_HDR_LEN; - } } else if (hdr.flags & HVPIPE_LOST_CONNECTION) { /* * Hypervisor is closing the pipe for the specific * source. So notify user space. 
*/ src_info->hvpipe_status &= ~HVPIPE_LOST_CONNECTION; - ret = HVPIPE_HDR_LEN; } + if (ret >= 0) + ret += HVPIPE_HDR_LEN; + return ret; } @@ -444,16 +449,18 @@ static int papr_hvpipe_handle_release(struct inode *inode, struct file *file) { struct hvpipe_source_info *src_info; + unsigned long flags; /* * Hold the lock, remove source from src_list, reset the * hvpipe status and release the lock to prevent any race * with message event IRQ. */ - spin_lock(&hvpipe_src_list_lock); + spin_lock_irqsave(&hvpipe_src_list_lock, flags); src_info = file->private_data; list_del(&src_info->list); file->private_data = NULL; + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); /* * If the pipe for this specific source has any pending * payload, issue recv HVPIPE RTAS so that pipe will not @@ -461,10 +468,8 @@ static int papr_hvpipe_handle_release(struct inode *inode, */ if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) { src_info->hvpipe_status = 0; - spin_unlock(&hvpipe_src_list_lock); hvpipe_rtas_recv_msg(NULL, 0); - } else - spin_unlock(&hvpipe_src_list_lock); + } kfree(src_info); return 0; @@ -479,50 +484,53 @@ static const struct file_operations papr_hvpipe_handle_ops = { static int papr_hvpipe_dev_create_handle(u32 srcID) { - struct hvpipe_source_info *src_info __free(kfree) = NULL; - - spin_lock(&hvpipe_src_list_lock); - /* - * Do not allow more than one process communicates with - * each source. 
- */ - src_info = hvpipe_find_source(srcID); - if (src_info) { - spin_unlock(&hvpipe_src_list_lock); - pr_err("pid(%d) is already using the source(%d)\n", - src_info->tsk->pid, srcID); - return -EALREADY; - } - spin_unlock(&hvpipe_src_list_lock); + struct hvpipe_source_info *src_info; + int fd; + unsigned long flags; src_info = kzalloc_obj(*src_info, GFP_KERNEL_ACCOUNT); if (!src_info) return -ENOMEM; src_info->srcID = srcID; - src_info->tsk = current; init_waitqueue_head(&src_info->recv_wqh); - FD_PREPARE(fdf, O_RDONLY | O_CLOEXEC, - anon_inode_getfile("[papr-hvpipe]", &papr_hvpipe_handle_ops, - (void *)src_info, O_RDWR)); - if (fdf.err) - return fdf.err; - - retain_and_null_ptr(src_info); - spin_lock(&hvpipe_src_list_lock); /* - * If two processes are executing ioctl() for the same - * source ID concurrently, prevent the second process to - * acquire FD. + * Do not allow more than one process communicates with + * each source. */ + spin_lock_irqsave(&hvpipe_src_list_lock, flags); if (hvpipe_find_source(srcID)) { - spin_unlock(&hvpipe_src_list_lock); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); + pr_err("pid(%s:%d) could not get the source(%d)\n", + current->comm, task_pid_nr(current), srcID); + kfree(src_info); return -EALREADY; } list_add(&src_info->list, &hvpipe_src_list); - spin_unlock(&hvpipe_src_list_lock); - return fd_publish(fdf); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); + + fd = FD_ADD(O_RDONLY | O_CLOEXEC, + anon_inode_getfile("[papr-hvpipe]", &papr_hvpipe_handle_ops, + (void *)src_info, O_RDWR)); + if (fd < 0) { + spin_lock_irqsave(&hvpipe_src_list_lock, flags); + list_del(&src_info->list); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); + /* + * if we fail to add FD, that means no userspace program is + * polling. 
In that case if there is a msg pending because the + * interrupt was fired after the src_info was added to the + * global list, then let's consume it here, to unblock the + * hvpipe + */ + if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) + hvpipe_rtas_recv_msg(NULL, 0); + kfree(src_info); + return fd; + } + + return fd; } /* @@ -685,20 +693,19 @@ static int __init enable_hvpipe_IRQ(void) struct device_node *np; hvpipe_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION); - if (hvpipe_check_exception_token == RTAS_UNKNOWN_SERVICE) + if (hvpipe_check_exception_token == RTAS_UNKNOWN_SERVICE) return -ENODEV; /* hvpipe events */ np = of_find_node_by_path("/event-sources/ibm,hvpipe-msg-events"); - if (np != NULL) { - request_event_sources_irqs(np, hvpipe_event_interrupt, - "HPIPE_EVENT"); - of_node_put(np); - } else { - pr_err("Can not enable hvpipe event IRQ\n"); + if (!np) { + pr_err("No device node found, could not enable hvpipe event IRQ\n"); return -ENODEV; } + request_event_sources_irqs(np, hvpipe_event_interrupt, "HPIPE_EVENT"); + of_node_put(np); + return 0; } @@ -775,23 +782,29 @@ static int __init papr_hvpipe_init(void) } ret = enable_hvpipe_IRQ(); - if (!ret) { - ret = set_hvpipe_sys_param(1); - if (!ret) - ret = misc_register(&papr_hvpipe_dev); - } + if (ret) + goto out_wq; - if (!ret) { - pr_info("hvpipe feature is enabled\n"); - hvpipe_feature = true; - return 0; - } + ret = misc_register(&papr_hvpipe_dev); + if (ret) + goto out_wq; - pr_err("hvpipe feature is not enabled %d\n", ret); + ret = set_hvpipe_sys_param(1); + if (ret) + goto out_misc; + + pr_info("hvpipe feature is enabled\n"); + hvpipe_feature = true; + return 0; + +out_misc: + misc_deregister(&papr_hvpipe_dev); +out_wq: destroy_workqueue(papr_hvpipe_wq); out: kfree(papr_hvpipe_work); papr_hvpipe_work = NULL; + pr_err("hvpipe feature is not enabled %d\n", ret); return ret; } machine_device_initcall(pseries, papr_hvpipe_init); diff --git 
a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h index c343f4230865..4bdf7bb2fc4d 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.h +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h @@ -21,7 +21,6 @@ struct hvpipe_source_info { u32 srcID; u32 hvpipe_status; wait_queue_head_t recv_wqh; /* wake up poll() waitq */ - struct task_struct *tsk; }; /* diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index d235396c4514..c5754942cf85 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -937,6 +937,28 @@ config RISCV_VECTOR_MISALIGNED help Enable detecting support for vector misaligned loads and stores. +config RISCV_SBI_FWFT_DELEGATE_MISALIGNED + bool "Request firmware delegation of unaligned access exceptions" + depends on RISCV_SBI + depends on NONPORTABLE + help + Use SBI FWFT to request delegation of load address misaligned and + store address misaligned exceptions, if possible, and prefer Linux + kernel emulation of these accesses to firmware emulation. + + Unfortunately, Linux's emulation is still incomplete. Namely, it + currently does not handle vector instructions and KVM guest accesses. + On platforms where these accesses would have been handled by firmware, + enabling this causes unexpected kernel oopses, userspaces crashes and + KVM guest crashes. If you are sure that these are not a problem for + your platform, you can say Y here, which may improve performance. + + Saying N here will not worsen emulation support for unaligned accesses + even in the case where the firmware also has incomplete support. It + simply keeps the firmware's emulation enabled. + + If you don't know what to do here, say N. 
+ choice prompt "Unaligned Accesses Support" default RISCV_PROBE_UNALIGNED_ACCESS diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi index 2d14e92f068d..9078e5b1e49c 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi @@ -101,16 +101,6 @@ status = "okay"; }; -&i2c0 { - pinctrl-names = "default"; - pinctrl-0 = <&i2c0_fabric>; -}; - -&i2c1 { - pinctrl-names = "default"; - pinctrl-0 = <&i2c1_mssio>; -}; - &mmuart1 { pinctrl-names = "default"; pinctrl-0 = <&uart1_fabric>; diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-prod.dts b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-prod.dts index 8afedece89d1..636493f6584d 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-prod.dts +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-prod.dts @@ -14,6 +14,16 @@ "microchip,mpfs"; }; +&i2c0 { + pinctrl-names = "default"; + pinctrl-0 = <&i2c0_fabric>; +}; + +&i2c1 { + pinctrl-names = "default"; + pinctrl-0 = <&i2c1_mssio>; +}; + &syscontroller { microchip,bitstream-flash = <&sys_ctrl_flash>; }; diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts index 556aa9638282..6fadce815c9a 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts @@ -11,3 +11,22 @@ "microchip,mpfs-icicle-kit", "microchip,mpfs"; }; + +&i2c0 { + pinctrl-names = "default"; + pinctrl-0 = <&i2c0_fabric>; +}; + +/* + * Due to silicon errata, routing via MSS IOs doesn't work on ES devices. + * Instead, i2c1, appearing on B1/C1, which are normally MSS IOs, is routed + * via the fabric and back to B1/C1 via "fabric-test" functionality. + * This is done silently by Libero, so the iomux0 setting for i2c1 has to + * be fabric IO, despite tooling etc saying that MSS IOs are used. 
+ * + * See Section 3.3 of https://ww1.microchip.com/downloads/aemDocuments/documents/FPGA/ProductDocuments/Errata/polarfiresoc/microsemi_polarfire_soc_fpga_egineering_samples_errata_er0219_v1.pdf + */ +&i2c1 { + pinctrl-names = "default"; + pinctrl-0 = <&i2c1_fabric>; +}; diff --git a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi index 8cfe8033305d..a7a1c09a2c90 100644 --- a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi +++ b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi @@ -135,29 +135,6 @@ clock-frequency = <49152000>; }; -&camss { - assigned-clocks = <&ispcrg JH7110_ISPCLK_DOM4_APB_FUNC>, - <&ispcrg JH7110_ISPCLK_MIPI_RX0_PXL>; - assigned-clock-rates = <49500000>, <198000000>; - - ports { - #address-cells = <1>; - #size-cells = <0>; - - port@0 { - reg = <0>; - }; - - port@1 { - reg = <1>; - - camss_from_csi2rx: endpoint { - remote-endpoint = <&csi2rx_to_camss>; - }; - }; - }; -}; - &csi2rx { assigned-clocks = <&ispcrg JH7110_ISPCLK_VIN_SYS>; assigned-clock-rates = <297000000>; @@ -175,9 +152,7 @@ port@1 { reg = <1>; - csi2rx_to_camss: endpoint { - remote-endpoint = <&camss_from_csi2rx>; - }; + /* remote CAMSS endpoint */ }; }; }; diff --git a/arch/riscv/boot/dts/starfive/jh7110.dtsi b/arch/riscv/boot/dts/starfive/jh7110.dtsi index 6e56e9d20bb0..9c3e4598747e 100644 --- a/arch/riscv/boot/dts/starfive/jh7110.dtsi +++ b/arch/riscv/boot/dts/starfive/jh7110.dtsi @@ -1199,34 +1199,6 @@ #phy-cells = <0>; }; - camss: isp@19840000 { - compatible = "starfive,jh7110-camss"; - reg = <0x0 0x19840000 0x0 0x10000>, - <0x0 0x19870000 0x0 0x30000>; - reg-names = "syscon", "isp"; - clocks = <&ispcrg JH7110_ISPCLK_DOM4_APB_FUNC>, - <&ispcrg JH7110_ISPCLK_ISPV2_TOP_WRAPPER_C>, - <&ispcrg JH7110_ISPCLK_DVP_INV>, - <&ispcrg JH7110_ISPCLK_VIN_P_AXI_WR>, - <&ispcrg JH7110_ISPCLK_MIPI_RX0_PXL>, - <&syscrg JH7110_SYSCLK_ISP_TOP_CORE>, - <&syscrg JH7110_SYSCLK_ISP_TOP_AXI>; - clock-names = "apb_func", "wrapper_clk_c", "dvp_inv", - 
"axiwr", "mipi_rx0_pxl", "ispcore_2x", - "isp_axi"; - resets = <&ispcrg JH7110_ISPRST_ISPV2_TOP_WRAPPER_P>, - <&ispcrg JH7110_ISPRST_ISPV2_TOP_WRAPPER_C>, - <&ispcrg JH7110_ISPRST_VIN_P_AXI_RD>, - <&ispcrg JH7110_ISPRST_VIN_P_AXI_WR>, - <&syscrg JH7110_SYSRST_ISP_TOP>, - <&syscrg JH7110_SYSRST_ISP_TOP_AXI>; - reset-names = "wrapper_p", "wrapper_c", "axird", - "axiwr", "isp_top_n", "isp_top_axi"; - power-domains = <&pwrc JH7110_PD_ISP>; - interrupts = <92>, <87>, <90>, <88>; - status = "disabled"; - }; - voutcrg: clock-controller@295c0000 { compatible = "starfive,jh7110-voutcrg"; reg = <0x0 0x295c0000 0x0 0x10000>; diff --git a/arch/riscv/errata/mips/errata.c b/arch/riscv/errata/mips/errata.c index e984a8152208..2c3dc2259e93 100644 --- a/arch/riscv/errata/mips/errata.c +++ b/arch/riscv/errata/mips/errata.c @@ -57,7 +57,7 @@ void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, } tmp = (1U << alt->patch_id); - if (cpu_req_errata && tmp) { + if (cpu_req_errata & tmp) { mutex_lock(&text_mutex); patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt), alt->alt_len); diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index bd5fc9403295..7721b63642f4 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -14,5 +14,6 @@ generic-y += ticket_spinlock.h generic-y += qrwlock.h generic-y += qrwlock_types.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/riscv/include/asm/syscall_wrapper.h b/arch/riscv/include/asm/syscall_wrapper.h index ac80216549ff..226289c3b5c8 100644 --- a/arch/riscv/include/asm/syscall_wrapper.h +++ b/arch/riscv/include/asm/syscall_wrapper.h @@ -32,6 +32,10 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments"); \ + __diag_ignore(clang, 23, "-Wunknown-warning-option", \ + "Avoid breaking 
versions without -Wattribute-alias"); \ + __diag_ignore(clang, 23, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments"); \ static long __se_##prefix##name(ulong, ulong, ulong, ulong, ulong, ulong, \ ulong) \ __attribute__((alias(__stringify(___se_##prefix##name)))); \ diff --git a/arch/riscv/kernel/compat_signal.c b/arch/riscv/kernel/compat_signal.c index 6ec4e34255a9..cf3eb33a11e4 100644 --- a/arch/riscv/kernel/compat_signal.c +++ b/arch/riscv/kernel/compat_signal.c @@ -107,6 +107,8 @@ static long compat_restore_sigcontext(struct pt_regs *regs, /* sc_regs is structured the same as the start of pt_regs */ err = __copy_from_user(&cregs, &sc->sc_regs, sizeof(sc->sc_regs)); + if (unlikely(err)) + return err; cregs_to_regs(&cregs, regs); diff --git a/arch/riscv/kernel/copy-unaligned.S b/arch/riscv/kernel/copy-unaligned.S index 2b3d9398c113..90f3549621f7 100644 --- a/arch/riscv/kernel/copy-unaligned.S +++ b/arch/riscv/kernel/copy-unaligned.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2023 Rivos Inc. */ +#include <linux/cfi_types.h> #include <linux/linkage.h> #include <asm/asm.h> @@ -9,7 +10,7 @@ /* void __riscv_copy_words_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using word loads and stores. */ /* Note: The size is truncated to a multiple of 8 * SZREG */ -SYM_FUNC_START(__riscv_copy_words_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_words_unaligned) andi a4, a2, ~((8*SZREG)-1) beqz a4, 2f add a3, a1, a4 @@ -41,7 +42,7 @@ SYM_FUNC_END(__riscv_copy_words_unaligned) /* void __riscv_copy_bytes_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using only byte accesses. 
*/ /* Note: The size is truncated to a multiple of 8 */ -SYM_FUNC_START(__riscv_copy_bytes_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_bytes_unaligned) andi a4, a2, ~(8-1) beqz a4, 2f add a3, a1, a4 diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 1734f9a4c2fd..f46aa5602d74 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -896,10 +896,8 @@ static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) * CPU cores with the ratified spec will contain non-zero * marchid. */ - if (acpi_disabled && boot_vendorid == THEAD_VENDOR_ID && boot_archid == 0x0) { - this_hwcap &= ~isa2hwcap[RISCV_ISA_EXT_v]; + if (acpi_disabled && boot_vendorid == THEAD_VENDOR_ID && boot_archid == 0x0) clear_bit(RISCV_ISA_EXT_v, source_isa); - } riscv_resolve_isa(source_isa, isainfo->isa, &this_hwcap, isa2hwcap); @@ -1104,16 +1102,16 @@ early_param("riscv_isa_fallback", riscv_isa_fallback_setup); void __init riscv_fill_hwcap(void) { char print_str[NUM_ALPHA_EXTS + 1]; - unsigned long isa2hwcap[26] = {0}; + unsigned long isa2hwcap[RISCV_ISA_EXT_BASE] = {0}; int i, j; - isa2hwcap['i' - 'a'] = COMPAT_HWCAP_ISA_I; - isa2hwcap['m' - 'a'] = COMPAT_HWCAP_ISA_M; - isa2hwcap['a' - 'a'] = COMPAT_HWCAP_ISA_A; - isa2hwcap['f' - 'a'] = COMPAT_HWCAP_ISA_F; - isa2hwcap['d' - 'a'] = COMPAT_HWCAP_ISA_D; - isa2hwcap['c' - 'a'] = COMPAT_HWCAP_ISA_C; - isa2hwcap['v' - 'a'] = COMPAT_HWCAP_ISA_V; + isa2hwcap[RISCV_ISA_EXT_i] = COMPAT_HWCAP_ISA_I; + isa2hwcap[RISCV_ISA_EXT_m] = COMPAT_HWCAP_ISA_M; + isa2hwcap[RISCV_ISA_EXT_a] = COMPAT_HWCAP_ISA_A; + isa2hwcap[RISCV_ISA_EXT_f] = COMPAT_HWCAP_ISA_F; + isa2hwcap[RISCV_ISA_EXT_d] = COMPAT_HWCAP_ISA_D; + isa2hwcap[RISCV_ISA_EXT_c] = COMPAT_HWCAP_ISA_C; + isa2hwcap[RISCV_ISA_EXT_v] = COMPAT_HWCAP_ISA_V; if (!acpi_disabled) { riscv_fill_hwcap_from_isa_string(isa2hwcap); diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 93de2e7a3074..793bcee46182 100644 --- 
a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -577,8 +577,8 @@ static int compat_riscv_gpr_set(struct task_struct *target, struct compat_user_regs_struct cregs; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &cregs, 0, -1); - - cregs_to_regs(&cregs, task_pt_regs(target)); + if (!ret) + cregs_to_regs(&cregs, task_pt_regs(target)); return ret; } diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 2a27d3ff4ac6..81b7682e6c6d 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -584,7 +584,7 @@ static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) static bool misaligned_traps_delegated; -#ifdef CONFIG_RISCV_SBI +#if defined(CONFIG_RISCV_SBI_FWFT_DELEGATE_MISALIGNED) static int cpu_online_sbi_unaligned_setup(unsigned int cpu) { diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c index 6eaa0d94fdfe..cbfb4e495e9f 100644 --- a/arch/riscv/kernel/usercfi.c +++ b/arch/riscv/kernel/usercfi.c @@ -109,15 +109,16 @@ void set_indir_lp_lock(struct task_struct *task, bool lock) task->thread_info.user_cfi_state.ufcfi_locked = lock; } /* - * If size is 0, then to be compatible with regular stack we want it to be as big as - * regular stack. Else PAGE_ALIGN it and return back + * The shadow stack only stores the return address and not any variables + * this should be more than sufficient for most applications. 
+ * Else PAGE_ALIGN it and return back */ static unsigned long calc_shstk_size(unsigned long size) { if (size) return PAGE_ALIGN(size); - return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); + return PAGE_ALIGN(min(rlimit(RLIMIT_STACK) / 2, SZ_2G)); } /* diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S index 7ce4de6f6e69..361039f7b944 100644 --- a/arch/riscv/kernel/vec-copy-unaligned.S +++ b/arch/riscv/kernel/vec-copy-unaligned.S @@ -2,6 +2,7 @@ /* Copyright (C) 2024 Rivos Inc. */ #include <linux/args.h> +#include <linux/cfi_types.h> #include <linux/linkage.h> #include <asm/asm.h> @@ -16,7 +17,7 @@ /* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using word loads and stores. */ /* Note: The size is truncated to a multiple of WORD_EEW */ -SYM_FUNC_START(__riscv_copy_vec_words_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_vec_words_unaligned) andi a4, a2, ~(WORD_EEW-1) beqz a4, 2f add a3, a1, a4 @@ -38,7 +39,7 @@ SYM_FUNC_END(__riscv_copy_vec_words_unaligned) /* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using only byte accesses. 
*/ /* Note: The size is truncated to a multiple of 8 */ -SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_vec_bytes_unaligned) andi a4, a2, ~(8-1) beqz a4, 2f add a3, a1, a4 diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c index 4d89b94128ae..f09f9251d1f0 100644 --- a/arch/riscv/kvm/vcpu_insn.c +++ b/arch/riscv/kvm/vcpu_insn.c @@ -415,7 +415,6 @@ int kvm_riscv_vcpu_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run, shift = 8 * (sizeof(ulong) - len); } else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) { len = 1; - shift = 8 * (sizeof(ulong) - len); #ifdef CONFIG_64BIT } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) { len = 8; @@ -649,22 +648,22 @@ int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) case 1: data8 = *((u8 *)run->mmio.data); SET_RD(insn, &vcpu->arch.guest_context, - (ulong)data8 << shift >> shift); + (long)((ulong)data8 << shift) >> shift); break; case 2: data16 = *((u16 *)run->mmio.data); SET_RD(insn, &vcpu->arch.guest_context, - (ulong)data16 << shift >> shift); + (long)((ulong)data16 << shift) >> shift); break; case 4: data32 = *((u32 *)run->mmio.data); SET_RD(insn, &vcpu->arch.guest_context, - (ulong)data32 << shift >> shift); + (long)((ulong)data32 << shift) >> shift); break; case 8: data64 = *((u64 *)run->mmio.data); SET_RD(insn, &vcpu->arch.guest_context, - (ulong)data64 << shift >> shift); + (long)((ulong)data64 << shift) >> shift); break; default: return -EOPNOTSUPP; diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index a935ed96bc17..bb46dcbfb24d 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -453,8 +453,10 @@ int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long s } kvpmu->sdata = kzalloc(snapshot_area_size, GFP_ATOMIC); - if (!kvpmu->sdata) - return -ENOMEM; + if (!kvpmu->sdata) { + sbiret = SBI_ERR_FAILURE; + goto out; + } /* No need to check writable slot explicitly as 
kvm_vcpu_write_guest does it internally */ if (kvm_vcpu_write_guest(vcpu, saddr, kvpmu->sdata, snapshot_area_size)) { @@ -499,8 +501,10 @@ int kvm_riscv_vcpu_pmu_event_info(struct kvm_vcpu *vcpu, unsigned long saddr_low } einfo = kzalloc(shmem_size, GFP_KERNEL); - if (!einfo) - return -ENOMEM; + if (!einfo) { + ret = SBI_ERR_FAILURE; + goto out; + } ret = kvm_vcpu_read_guest(vcpu, shmem, einfo, shmem_size); if (ret) { diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index 3b834709b429..60e50296a008 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -46,7 +46,7 @@ void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) gfn = shmem >> PAGE_SHIFT; hva = kvm_vcpu_gfn_to_hva(vcpu, gfn); - if (WARN_ON(kvm_is_error_hva(hva))) { + if (kvm_is_error_hva(hva)) { vcpu->arch.sta.shmem = INVALID_GPA; return; } diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index 188d5ea5b3b8..c9c323d4577a 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -55,6 +55,8 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, for_each_set_bit(i, &hmask, BITS_PER_LONG) { rvcpu = kvm_get_vcpu_by_id(vcpu->kvm, i); + if (!rvcpu) + continue; ret = kvm_riscv_vcpu_set_interrupt(rvcpu, IRQ_VS_SOFT); if (ret < 0) break; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index decd7df40fa4..fa8d2f6f554b 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -792,6 +792,27 @@ static void __init set_mmap_rnd_bits_max(void) mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3; } +static bool __init is_vaddr_valid(unsigned long va) +{ + unsigned long up = 0; + + switch (satp_mode) { + case SATP_MODE_39: + up = 1UL << 38; + break; + case SATP_MODE_48: + up = 1UL << 47; + break; + case SATP_MODE_57: + up = 1UL << 56; + break; + default: + return false; + } + + return (va < up) || (va >= (ULONG_MAX - up + 1)); +} + /* * There is a simple way to determine if 
4-level is supported by the * underlying hardware: establish 1:1 mapping in 4-level page table mode @@ -833,6 +854,9 @@ static __init void set_satp_mode(uintptr_t dtb_pa) set_satp_mode_pmd + PMD_SIZE, PMD_SIZE, PAGE_KERNEL_EXEC); retry: + if (!is_vaddr_valid(set_satp_mode_pmd)) + goto out; + create_pgd_mapping(early_pg_dir, set_satp_mode_pmd, pgtable_l5_enabled ? @@ -855,6 +879,7 @@ retry: disable_pgtable_l4(); } +out: memset(early_pg_dir, 0, PAGE_SIZE); memset(early_p4d, 0, PAGE_SIZE); memset(early_pud, 0, PAGE_SIZE); diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index c28f9a7d0bd8..730c90b4a876 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -56,6 +56,10 @@ CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m +CONFIG_VFIO_DEVICE_CDEV=y +CONFIG_IOMMUFD_DRIVER=y +CONFIG_IOMMUFD_DRIVER_CORE=y +CONFIG_IOMMUFD=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_S390_HYPFS_FS=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index d89c988f33ea..dd5fc1426c88 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -54,6 +54,10 @@ CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m +CONFIG_VFIO_DEVICE_CDEV=y +CONFIG_IOMMUFD_DRIVER=y +CONFIG_IOMMUFD_DRIVER_CORE=y +CONFIG_IOMMUFD=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_S390_HYPFS_FS=y diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 80bad7de7a04..0c1fc47c3ba0 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -7,3 +7,4 @@ generated-y += unistd_nr.h generic-y += asm-offsets.h generic-y += mcs_spinlock.h generic-y += mmzone.h +generic-y += ring_buffer.h diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 59017fd3d935..50a270edb020 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -12,12 +12,11 @@ #if defined(CONFIG_BUG) && 
defined(CONFIG_CC_HAS_ASM_IMMEDIATE_STRINGS) #ifdef CONFIG_DEBUG_BUGVERBOSE -#define __BUG_ENTRY_VERBOSE(format, file, line) \ - " .long " format " - . # bug_entry::format\n" \ +#define __BUG_ENTRY_VERBOSE(file, line) \ " .long " file " - . # bug_entry::file\n" \ " .short " line " # bug_entry::line\n" #else -#define __BUG_ENTRY_VERBOSE(format, file, line) +#define __BUG_ENTRY_VERBOSE(file, line) #endif #ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED @@ -28,9 +27,10 @@ #define __BUG_ENTRY(format, file, line, flags, size) \ " .section __bug_table,\"aw\"\n" \ - "1: .long 0b - . # bug_entry::bug_addr\n" \ - __BUG_ENTRY_VERBOSE(format, file, line) \ - " .short "flags" # bug_entry::flags\n" \ + "1: .long 0b - . # bug_entry::bug_addr\n"\ + " .long " format " - . # bug_entry::format\n" \ + __BUG_ENTRY_VERBOSE(file, line) \ + " .short "flags" # bug_entry::flags\n" \ " .org 1b+"size"\n" \ " .previous" diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h index 2d3ae421077e..d2b616604a46 100644 --- a/arch/s390/include/asm/gmap_helpers.h +++ b/arch/s390/include/asm/gmap_helpers.h @@ -12,5 +12,6 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); int gmap_helper_disable_cow_sharing(void); void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr); +pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); #endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index df3fb7d8227b..1b3ac553a642 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -7,4 +7,6 @@ #define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x07 #define __ALIGN_STR __stringify(__ALIGN) +#define _THIS_IP_ ({ unsigned long __ip; asm volatile("larl %0, ." 
: "=d" (__ip)); __ip; }) + #endif diff --git a/arch/s390/kernel/perf_pai.c b/arch/s390/kernel/perf_pai.c index 86f71a3d1ef2..cdb8006220ca 100644 --- a/arch/s390/kernel/perf_pai.c +++ b/arch/s390/kernel/perf_pai.c @@ -186,6 +186,13 @@ static u64 pai_getctr(unsigned long *page, int nr, unsigned long offset) return page[nr]; } +static void pai_setctr(unsigned long *page, int nr, unsigned long offset, u64 v) +{ + if (offset) + nr += offset / sizeof(*page); + page[nr] = v; +} + /* Read the counter values. Return value from location in CMP. For base * event xxx_ALL sum up all events. Returns counter value. */ @@ -551,6 +558,8 @@ static void paicrypt_del(struct perf_event *event, int flags) /* Create raw data and save it in buffer. Calculate the delta for each * counter between this invocation and the last invocation. * Returns number of bytes copied. + * After reading from PAI counter page, save the read value to the old + * page to calculate PAI counter deltas. * Saves only entries with positive counter difference of the form * 2 bytes: Number of counter * 8 bytes: Value of counter @@ -562,16 +571,22 @@ static size_t pai_copy(struct pai_userdata *userdata, unsigned long *page, int i, outidx = 0; for (i = 1; i <= pp->num_avail; i++) { - u64 val = 0, val_old = 0; + u64 val = 0, val_old = 0, val_k = 0, val_old_k = 0; if (!exclude_kernel) { - val += pai_getctr(page, i, pp->kernel_offset); - val_old += pai_getctr(page_old, i, pp->kernel_offset); + val_k = pai_getctr(page, i, pp->kernel_offset); + val_old_k = pai_getctr(page_old, i, pp->kernel_offset); + if (val_k != val_old_k) + pai_setctr(page_old, i, pp->kernel_offset, val_k); } if (!exclude_user) { - val += pai_getctr(page, i, 0); - val_old += pai_getctr(page_old, i, 0); + val = pai_getctr(page, i, 0); + val_old = pai_getctr(page_old, i, 0); + if (val != val_old) + pai_setctr(page_old, i, 0, val); } + val += val_k; + val_old += val_old_k; if (val >= val_old) val -= val_old; else @@ -602,8 +617,6 @@ static size_t 
pai_copy(struct pai_userdata *userdata, unsigned long *page, static int pai_push_sample(size_t rawsize, struct pai_map *cpump, struct perf_event *event) { - int idx = PAI_PMU_IDX(event); - struct pai_pmu *pp = &pai_pmu[idx]; struct perf_sample_data data; struct perf_raw_record raw; struct pt_regs regs; @@ -634,8 +647,6 @@ static int pai_push_sample(size_t rawsize, struct pai_map *cpump, overflow = perf_event_overflow(event, &data, &regs); perf_event_update_userpage(event); - /* Save crypto counter lowcore page after reading event data. */ - memcpy((void *)PAI_SAVE_AREA(event), cpump->area, pp->area_size); return overflow; } @@ -651,7 +662,7 @@ static void pai_have_sample(struct perf_event *event, struct pai_map *cpump) rawsize = pai_copy(cpump->save, cpump->area, pp, (unsigned long *)PAI_SAVE_AREA(event), event->attr.exclude_user, - event->attr.exclude_kernel); + !pp->kernel_offset ? true : event->attr.exclude_kernel); if (rawsize) /* No incremented counters */ pai_push_sample(rawsize, cpump, event); } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 1913a5566ac2..1377c6f3f670 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -192,17 +192,21 @@ static void tl_to_masks(struct sysinfo_15_1_x *info) end = (union topology_entry *)((unsigned long)info + info->length); while (tle < end) { switch (tle->nl) { + /* + * Adjust drawer_id, book_id, and socked_id so they match the + * numbering scheme of e.g. the hardware management console. 
+ */ case 3: drawer = drawer->next; - drawer->id = tle->container.id; + drawer->id = tle->container.id - 1; break; case 2: book = book->next; - book->id = tle->container.id; + book->id = tle->container.id - 1; break; case 1: socket = socket->next; - socket->id = tle->container.id; + socket->id = tle->container.id - 1; break; case 0: add_cpus_to_mask(&tle->cpu, drawer, book, socket); diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index 7b8d70fe406d..4a41c0247ffa 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -267,6 +267,7 @@ static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t g /* No need to take locks as the page table is not installed yet. */ pgste_init.prefix_notif = old.s.fc1.prefix_notif; pgste_init.vsie_notif = old.s.fc1.vsie_notif; + pgste_init.vsie_gmem = old.s.fc1.vsie_notif; pgste_init.pcl = uses_skeys && init.h.i; dat_init_pgstes(pt, pgste_init.val); } else { diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index 8f8278c44879..873e13ac5a27 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -145,7 +145,8 @@ union pgste { unsigned long cmma_d : 1; /* Dirty flag for CMMA bits */ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ unsigned long vsie_notif : 1; /* Referenced in a shadow table */ - unsigned long : 5; + unsigned long vsie_gmem : 1; /* Contains nested guest memory */ + unsigned long : 4; unsigned long : 8; }; struct { diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c index ddf0ca71f374..fee80047bd94 100644 --- a/arch/s390/kvm/faultin.c +++ b/arch/s390/kvm/faultin.c @@ -36,7 +36,8 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa struct kvm_s390_mmu_cache *mc = NULL; struct kvm_memory_slot *slot; unsigned long inv_seq; - int foll, rc = 0; + int rc = -EAGAIN; + int foll; foll = f->write_attempt ? FOLL_WRITE : 0; foll |= f->attempt_pfault ? 
FOLL_NOWAIT : 0; @@ -53,7 +54,14 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa return 0; } - while (1) { + if (!mc) { + local_mc = kvm_s390_new_mmu_cache(); + if (!local_mc) + return -ENOMEM; + mc = local_mc; + } + + while (rc == -EAGAIN) { f->valid = false; inv_seq = kvm->mmu_invalidate_seq; /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ @@ -93,14 +101,7 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa if (is_error_pfn(f->pfn)) return -EFAULT; - if (!mc) { - local_mc = kvm_s390_new_mmu_cache(); - if (!local_mc) - return -ENOMEM; - mc = local_mc; - } - - /* Loop, will automatically release the faulted page. */ + /* Loop, release the faulted page. */ if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) { kvm_release_faultin_page(kvm, f->page, true, false); continue; @@ -110,20 +111,19 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) { f->valid = true; rc = gmap_link(mc, kvm->arch.gmap, f, slot); - kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); - f->page = NULL; } + kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); } - kvm_release_faultin_page(kvm, f->page, true, false); if (rc == -ENOMEM) { rc = kvm_s390_mmu_cache_topup(mc); if (rc) return rc; - } else if (rc != -EAGAIN) { - return rc; + rc = -EAGAIN; } } + + return rc; } int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index b07accd19618..20e28b183c1a 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1445,6 +1445,7 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union } else { pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false); pgste.vsie_notif = 1; + pgste.vsie_gmem = 1; } pgste_set_unlock(ptep_h, pgste); if (rc) @@ -1465,15 +1466,17 
@@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni struct guest_fault *f, bool p) { union crste newcrste, oldcrste; - gfn_t gfn; + unsigned long mask; + gfn_t r_gfn; int rc; lockdep_assert_held(&sg->kvm->mmu_lock); lockdep_assert_held(&sg->parent->children_lock); - gfn = f->gfn & (is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK); + mask = is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK; + r_gfn = gpa_to_gfn(raddr) & mask; scoped_guard(spinlock, &sg->host_to_rmap_lock) - rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt); + rc = gmap_insert_rmap(sg, f->gfn & mask, r_gfn, host->h.tt); if (rc) return rc; @@ -1496,8 +1499,7 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni return -EAGAIN; newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p); - gfn = gpa_to_gfn(raddr); - while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, r_gfn, sg->asce)) ; return 0; } diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 3c26e35af0ef..52d55ddea8d4 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -125,7 +125,7 @@ struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit) int gmap_set_limit(struct gmap *gmap, gfn_t limit) { - struct kvm_s390_mmu_cache *mc; + struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; int rc, type; type = gmap_limit_to_type(limit); @@ -142,7 +142,6 @@ int gmap_set_limit(struct gmap *gmap, gfn_t limit) rc = dat_set_asce_limit(mc, &gmap->asce, type); } while (rc == -ENOMEM); - kvm_s390_free_mmu_cache(mc); return 0; } @@ -396,15 +395,28 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct struct gmap_unmap_priv *priv = walk->priv; struct folio *folio = NULL; union crste old = *crstep; + bool ok; if (!old.h.fc) return 0; if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) folio = 
phys_to_folio(crste_origin_large(old)); - /* No races should happen because kvm->mmu_lock is held in write mode */ - KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn), - priv->gmap->kvm); + /* + * No races should happen because kvm->mmu_lock is held in write mode, + * but the unmap operation could have triggered an unshadow, which + * causes gmap_crstep_xchg_atomic() to return false and clear the + * vsie_notif bit. Allow the operation to fail once, if the old crste + * had the vsie_notif bit set. A second failure is not allowed, for + * the reasons above. + */ + ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); + if (!ok) { + KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm); + old.s.fc1.vsie_notif = 0; + ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); + KVM_BUG_ON(!ok, priv->gmap->kvm); + } if (folio) uv_convert_from_secure_folio(folio); @@ -822,8 +834,8 @@ int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) { - struct kvm_s390_mmu_cache *mc; - int rc; + struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; + int rc = 0; mc = kvm_s390_new_mmu_cache(); if (!mc) @@ -1026,13 +1038,15 @@ int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level) int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, kvm_pfn_t pfn, int level, bool wr) { + unsigned long bitmask; union crste *crstep; union pgste pgste; union pte *ptep; union pte pte; int flags, rc; - KVM_BUG_ON(!is_shadow(sg), sg->kvm); + if (KVM_BUG_ON(!is_shadow(sg) || level <= TABLE_TYPE_PAGE_TABLE, sg->kvm)) + return -EINVAL; lockdep_assert_held(&sg->parent->children_lock); flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? 
DAT_WALK_USES_SKEYS : 0); @@ -1041,8 +1055,9 @@ int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gf if (rc) return rc; if (level <= TABLE_TYPE_REGION1) { + bitmask = -1UL << (8 + 11 * level); scoped_guard(spinlock, &sg->host_to_rmap_lock) - rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level); + rc = gmap_insert_rmap(sg, p_gfn, r_gfn & bitmask, level); } if (rc) return rc; @@ -1143,8 +1158,10 @@ void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) } scoped_guard(spinlock, &sg->host_to_rmap_lock) head = radix_tree_delete(&sg->host_to_rmap, gfn); - gmap_for_each_rmap_safe(rmap, rnext, head) + gmap_for_each_rmap_safe(rmap, rnext, head) { gmap_unshadow_level(sg, rmap->r_gfn, rmap->level); + kfree(rmap); + } } } diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index 96ee1395a592..5374f21aaf8d 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -167,6 +167,36 @@ static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) return _gmap_unmap_prefix(gmap, gfn, end, false); } +/** + * pte_needs_unshadow() -- Check if the pte operations triggers unshadowing. + * @oldpte: the previous value for the guest pte. + * @newpte: the new pte being set. + * @pgste: the pgste for the pte entry. + * + * If the pgste.vsie_notif bit is not set, return false: the page is not + * involved in vsie and thus should not trigger an unshadow operation. + * + * If the pgste.vsie_gmem bit is set, this pte represents shadowed guest + * memory. The access rights on g3's memory should be synchronized with g1's + * and g2's. Therefore unshadowing is triggered if the new and old pte + * differ in protection, or if the new pte is invalid. + * + * If the pgste.vsie_gmem bit is not set, this pte maps the g2 dat tables + * for g3. If the entry becomes writable or absent, it becomes impossible to + * guarantee that the shadow mapping will match g2's mapping. In that case, + * trigger an unshadow event. 
+ * + * Return: true if an unshadow event should be triggered, otherwise false. + */ +static inline bool pte_needs_unshadow(union pte oldpte, union pte newpte, union pgste pgste) +{ + if (!pgste.vsie_notif) + return false; + if (pgste.vsie_gmem) + return (oldpte.h.p != newpte.h.p) || newpte.h.i; + return !newpte.h.p || !newpte.s.pr; +} + static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, union pgste pgste, gfn_t gfn, bool needs_lock) { @@ -180,8 +210,9 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un pgste.prefix_notif = 0; gmap_unmap_prefix(gmap, gfn, gfn + 1); } - if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) { + if (pte_needs_unshadow(*ptep, newpte, pgste)) { pgste.vsie_notif = 0; + pgste.vsie_gmem = 0; if (needs_lock) gmap_handle_vsie_unshadow_event(gmap, gfn); else @@ -189,6 +220,7 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un } if (!ptep->s.d && newpte.s.d && !newpte.s.s) SetPageDirty(pfn_to_page(newpte.h.pfra)); + pgste.zero = 0; return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap)); } @@ -198,6 +230,30 @@ static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, uni return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true); } +/** + * crste_needs_unshadow() -- Check if the crste operations triggers unshadowing. + * @oldcrste: the previous value for the crste. + * @newcrste: the new value for the crste. + * + * If the old crste did not have the vsie_notif bit set, return false: the + * page is not involved in vsie and thus should not trigger an unshadow + * operation. Conversely, if the bit is set, it can only be g3 memory, since + * dat tables are never mapped using large pages. + * + * Similar to the pgste.vsie_gmem case of pte_needs_unshadow(), if the + * protection bit is changing or the new page is invalid, trigger an + * unshadow event. 
Also trigger an unshadow event if the new crste does not + * have the vsie_notif bit set. + * + * Return: true if an unshadow event should be triggered, otherwise false. + */ +static inline bool crste_needs_unshadow(union crste oldcrste, union crste newcrste) +{ + if (!oldcrste.s.fc1.vsie_notif) + return false; + return (newcrste.h.p != oldcrste.h.p) || newcrste.h.i || !newcrste.s.fc1.vsie_notif; +} + static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep, union crste oldcrste, union crste newcrste, gfn_t gfn, bool needs_lock) @@ -216,13 +272,15 @@ static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, unio newcrste.s.fc1.prefix_notif = 0; gmap_unmap_prefix(gmap, gfn, gfn + align); } - if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif && - (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) { + if (crste_leaf(oldcrste) && crste_needs_unshadow(oldcrste, newcrste)) { + newcrste = oldcrste; newcrste.s.fc1.vsie_notif = 0; if (needs_lock) gmap_handle_vsie_unshadow_event(gmap, gfn); else _gmap_handle_vsie_unshadow_event(gmap, gfn); + dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce); + return false; } if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s) SetPageDirty(phys_to_page(crste_origin_large(newcrste))); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 07f59c3b9a7b..3bcdbbbb6891 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3310,8 +3310,7 @@ static void aen_host_forward(unsigned long si) struct zpci_gaite *gaite; struct kvm *kvm; - gaite = (struct zpci_gaite *)aift->gait + - (si * sizeof(struct zpci_gaite)); + gaite = aift->gait + si; if (gaite->count == 0) return; if (gaite->aisb != 0) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e09960c2e6ed..ffb20a64d328 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -999,7 +999,10 @@ static int 
kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att break; } case KVM_S390_VM_MEM_LIMIT_SIZE: { + struct kvm_memslots *slots; + struct kvm_memory_slot *ms; unsigned long new_limit; + int bkt; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -1007,6 +1010,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (get_user(new_limit, (u64 __user *)attr->addr)) return -EFAULT; + guard(mutex)(&kvm->lock); + + new_limit = ALIGN(new_limit, HPAGE_SIZE); if (kvm->arch.mem_limit != KVM_S390_NO_MEM_LIMIT && new_limit > kvm->arch.mem_limit) return -E2BIG; @@ -1014,12 +1020,27 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - ret = -EBUSY; - if (!kvm->created_vcpus) - ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); + if (kvm->created_vcpus) + return -EBUSY; + + ret = 0; + scoped_guard(mutex, &kvm->slots_lock) { + slots = kvm_memslots(kvm); + if (slots && !kvm_memslots_empty(slots)) { + kvm_for_each_memslot(ms, bkt, slots) { + if (gpa_to_gfn(new_limit) < ms->base_gfn + ms->npages) { + ret = -EBUSY; + break; + } + } + } + if (!ret) + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); + } + if (ret) + break; VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); - VM_EVENT(kvm, 3, "New guest asce: 0x%p", - (void *)kvm->arch.gmap->asce.val); + VM_EVENT(kvm, 3, "New guest asce: 0x%p", (void *)kvm->arch.gmap->asce.val); break; } default: @@ -5672,6 +5693,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return -EINVAL; if ((new->base_gfn + new->npages) * PAGE_SIZE > kvm->arch.mem_limit) return -EINVAL; + if (!asce_contains_gfn(kvm->arch.gmap->asce, new->base_gfn + new->npages - 1)) + return -EINVAL; } if (!kvm->arch.migration_mode) diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 86d93e8dddae..5b075c38998e 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -166,7 +166,7 @@ static int kvm_zpci_set_airq(struct 
zpci_dev *zdev) fib.fmt0.noi = airq_iv_end(zdev->aibv); fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); fib.fmt0.aibvo = 0; - fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector) + (zdev->aisb / 64) * 8; fib.fmt0.aisbo = zdev->aisb & 63; fib.gd = zdev->gisa; @@ -290,8 +290,7 @@ static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, phys_to_virt(fib->fmt0.aibv)); spin_lock_irq(&aift->gait_lock); - gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * - sizeof(struct zpci_gaite)); + gaite = aift->gait + zdev->aisb; /* If assist not requested, host will get all alerts */ if (assist) @@ -309,7 +308,7 @@ static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, /* Update guest FIB for re-issue */ fib->fmt0.aisbo = zdev->aisb & 63; - fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector) + (zdev->aisb / 64) * 8; fib->fmt0.isc = gisc; /* Save some guest fib values in the host for later use */ @@ -357,8 +356,7 @@ static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) if (zdev->kzdev->fib.fmt0.aibv == 0) goto out; spin_lock_irq(&aift->gait_lock); - gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * - sizeof(struct zpci_gaite)); + gaite = aift->gait + zdev->aisb; isc = gaite->gisc; gaite->count--; if (gaite->count == 0) { diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index cc0553da14cb..447ec7ed423d 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1188,6 +1188,7 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len union crste *crstep; union pgste pgste; union pte *ptep; + hva_t hva; int i; lockdep_assert_held(&vcpu->kvm->mmu_lock); @@ -1199,8 +1200,11 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len if (!ptep || ptep->s.pr) continue; pgste = pgste_get_lock(ptep); - if 
(pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) - gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) { + hva = gpa_to_hva(vcpu->kvm, cbrl[i]); + if (!kvm_is_error_hva(hva)) + gmap_helper_zap_one_page(vcpu->kvm->mm, hva); + } pgste_set_unlock(ptep, pgste); } } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index c2dafd812a3b..4b865e75351c 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -17,6 +17,7 @@ #include <linux/pagewalk.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> +#include <asm/gmap_helpers.h> #include "kvm-s390.h" #include "dat.h" #include "gaccess.h" @@ -73,6 +74,7 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str struct pv_make_secure { void *uvcb; struct folio *folio; + struct kvm *kvm; int rc; bool needs_export; }; @@ -103,9 +105,21 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) { struct pv_make_secure *priv = f->priv; struct folio *folio; + spinlock_t *ptl; /* pte lock from try_get_locked_pte() */ + pte_t *ptep; folio = pfn_folio(f->pfn); priv->rc = -EAGAIN; + + if (!mmap_read_trylock(priv->kvm->mm)) + return; + + ptep = try_get_locked_pte(priv->kvm->mm, gfn_to_hva(priv->kvm, f->gfn), &ptl); + if (IS_ERR_VALUE(ptep)) { + priv->rc = PTR_ERR(ptep); + goto out; + } + if (folio_trylock(folio)) { priv->rc = __kvm_s390_pv_make_secure(f, folio); if (priv->rc == -E2BIG || priv->rc == -EBUSY) { @@ -114,6 +128,11 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) } folio_unlock(folio); } + + if (ptep) + pte_unmap_unlock(ptep, ptl); +out: + mmap_read_unlock(priv->kvm->mm); } /** @@ -127,7 +146,7 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) */ int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) { - struct pv_make_secure priv = { .uvcb = uvcb }; + struct pv_make_secure priv = { .uvcb = uvcb, .kvm = kvm, }; struct guest_fault f = { .write_attempt = true, .gfn = 
gpa_to_gfn(gaddr), diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index f8789ffcc05c..1cfe4724fbe2 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -17,22 +17,68 @@ #include <asm/gmap_helpers.h> /** - * ptep_zap_softleaf_entry() - discard a software leaf entry. + * try_get_locked_pte() - like get_locked_pte(), but atomic and with trylock * @mm: the mm - * @entry: the software leaf entry that needs to be zapped + * @vmaddr: the userspace virtual address whose pte is to be found + * @ptl: will be set to the pointer to the lock used to lock the pte in case + * of success. * - * Discards the given software leaf entry. If the leaf entry was an actual - * swap entry (and not a migration entry, for example), the actual swapped - * page is also discarded from swap. + * This function returns the pointer to the pte corresponding to @addr in @mm, + * similarly to get_locked_pte(). Unlike get_locked_pte(), no attempt is made + * to allocate missing page tables. If a missing or large entry is found, the + * function will return NULL. If the ptl lock is contended, %-EAGAIN is + * returned. + * + * In case of success, *@ptl will point to the locked pte lock for the returned + * pte, like get_locked_pte() does. + * + * Context: mmap_lock or vma lock for read or for write needs to be held. + * Return: + * * %NULL if the pte cannot be reached. + * * %-EAGAIN if the pte can be reached, but cannot be locked. + * * the pointer to the pte corresponding to @addr in @mm, if it can be reached + * and locked. 
*/ -static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) +pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long vmaddr, spinlock_t **ptl) { - if (softleaf_is_swap(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (softleaf_is_migration(entry)) - dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); - swap_put_entries_direct(entry, 1); + pmd_t *pmdp, pmd, pmdval; + pud_t *pudp, pud; + p4d_t *p4dp, p4d; + pgd_t *pgdp, pgd; + pte_t *ptep; + + pgdp = pgd_offset(mm, vmaddr); + pgd = pgdp_get(pgdp); + if (pgd_none(pgd) || !pgd_present(pgd)) + return NULL; + p4dp = p4d_offset(pgdp, vmaddr); + p4d = p4dp_get(p4dp); + if (p4d_none(p4d) || !p4d_present(p4d)) + return NULL; + pudp = pud_offset(p4dp, vmaddr); + pud = pudp_get(pudp); + if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) + return NULL; + pmdp = pmd_offset(pudp, vmaddr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) + return NULL; + ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, ptl); + if (!ptep) + return NULL; + + if (spin_trylock(*ptl)) { + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmdp)))) { + pte_unmap_unlock(ptep, *ptl); + return ERR_PTR(-EAGAIN); + } + return ptep; + } + + pte_unmap(ptep); + return ERR_PTR(-EAGAIN); } +EXPORT_SYMBOL_GPL(try_get_locked_pte); /** * gmap_helper_zap_one_page() - discard a page if it was swapped. 
@@ -46,7 +92,8 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; - spinlock_t *ptl; + spinlock_t *ptl; /* Lock for the host (userspace) page table */ + softleaf_t sl; pte_t *ptep; mmap_assert_locked(mm); @@ -57,11 +104,13 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) return; /* Get pointer to the page table entry */ - ptep = get_locked_pte(mm, vmaddr, &ptl); - if (unlikely(!ptep)) + ptep = try_get_locked_pte(mm, vmaddr, &ptl); + if (IS_ERR_OR_NULL(ptep)) return; - if (pte_swap(*ptep)) { - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + sl = softleaf_from_pte(*ptep); + if (pte_swap(*ptep) && softleaf_is_swap(sl)) { + dec_mm_counter(mm, MM_SWAPENTS); + swap_put_entries_direct(sl, 1); pte_clear(mm, vmaddr, ptep); } pte_unmap_unlock(ptep, ptl); @@ -113,37 +162,9 @@ EXPORT_SYMBOL_GPL(gmap_helper_discard); */ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) { - pmd_t *pmdp, pmd, pmdval; - pud_t *pudp, pud; - p4d_t *p4dp, p4d; - pgd_t *pgdp, pgd; spinlock_t *ptl; /* Lock for the host (userspace) page table */ pte_t *ptep; - pgdp = pgd_offset(mm, vmaddr); - pgd = pgdp_get(pgdp); - if (pgd_none(pgd) || !pgd_present(pgd)) - return; - - p4dp = p4d_offset(pgdp, vmaddr); - p4d = p4dp_get(p4dp); - if (p4d_none(p4d) || !p4d_present(p4d)) - return; - - pudp = pud_offset(p4dp, vmaddr); - pud = pudp_get(pudp); - if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) - return; - - pmdp = pmd_offset(pudp, vmaddr); - pmd = pmdp_get_lockless(pmdp); - if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) - return; - - ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl); - if (!ptep) - return; - /* * Several paths exists that takes the ptl lock and then call the * mmu_notifier, which takes the mmu_lock. 
The unmap path, instead, @@ -156,21 +177,12 @@ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) * If the lock is contended the bit is not set and the deadlock is * avoided. */ - if (spin_trylock(ptl)) { - /* - * Make sure the pte we are touching is still the correct - * one. In theory this check should not be needed, but - * better safe than sorry. - * Disabling interrupts or holding the mmap lock is enough to - * guarantee that no concurrent updates to the page tables - * are possible. - */ - if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp)))) - __atomic64_or(_PAGE_UNUSED, (long *)ptep); - spin_unlock(ptl); - } + ptep = try_get_locked_pte(mm, vmaddr, &ptl); + if (IS_ERR_OR_NULL(ptep)) + return; - pte_unmap(ptep); + __atomic64_or(_PAGE_UNUSED, (long *)ptep); + pte_unmap_unlock(ptep, ptl); } EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused); diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 4d3f10ed8275..f0403d3ee8ab 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -3,4 +3,5 @@ generated-y += syscall_table.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 17ee8a273aa6..49c6bb326b75 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -4,4 +4,5 @@ generated-y += syscall_table_64.h generic-y += agp.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 1b9b82bbe322..2a1629ba8140 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -17,6 +17,7 @@ generic-y += module.lds.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h +generic-y += ring_buffer.h generic-y += runtime-const.h generic-y += softirq_stack.h generic-y += 
switch_to.h diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 46fec0b08487..1d526a5d2a83 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -77,6 +77,10 @@ KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 +# The target.json file is not available when invoking rustc-option, so use the +# built-in target when checking whether flags are supported instead. +KBUILD_RUSTFLAGS_OPTION_CHKS += --target=x86_64-unknown-none + # # CFLAGS for compiling floating point code inside the kernel. # diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 19c13afa474e..9adecd65639f 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -14,6 +14,14 @@ endif KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json +# The target.json file is not available when invoking rustc-option, so use the +# built-in target when checking whether flags are supported instead. 
+ifeq ($(CONFIG_X86_32),y) +KBUILD_RUSTFLAGS_OPTION_CHKS += --target=i686-unknown-linux-gnu +else +KBUILD_RUSTFLAGS_OPTION_CHKS += --target=x86_64-unknown-linux-gnu +endif + ifeq ($(CONFIG_X86_32),y) START := 0x8048000 diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 72cae8e0ce85..83b4762d6ecb 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) CFLAGS_syscall_32.o += -fno-stack-protector CFLAGS_syscall_64.o += -fno-stack-protector -obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o +obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o common.o obj-y += vdso/ obj-y += vsyscall/ diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c new file mode 100644 index 000000000000..14cd43d4da6c --- /dev/null +++ b/arch/x86/entry/common.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/entry-common.h> +#include <linux/kvm_types.h> +#include <linux/hrtimer_rearm.h> +#include <asm/fred.h> +#include <asm/desc.h> + +#if IS_ENABLED(CONFIG_KVM_INTEL) +/* + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit. + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs + * to the kernel for servicing. On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered + * the VM-Exit is held pending until it's unblocked in the host. + */ +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector) +{ + if (event_type == EVENT_TYPE_EXTINT) { +#ifdef CONFIG_X86_64 + /* + * Use FRED dispatch, even when running IDT. The dispatch + * tables are kept in sync between FRED and IDT, and the FRED + * dispatch works well with CFI. 
+ */ + fred_entry_from_kvm(event_type, vector); +#else + idt_entry_from_kvm(vector); +#endif + /* + * Strictly speaking, only the NMI path requires noinstr. + */ + instrumentation_begin(); + /* + * KVM/VMX will dispatch from IRQ-disabled but for a context + * that will have IRQs-enabled. This confuses the entry code + * and it will not have reprogrammed the timer. Do so now. + */ + hrtimer_rearm_deferred(); + instrumentation_end(); + + return; + } + + WARN_ON_ONCE(event_type != EVENT_TYPE_NMI); + +#ifdef CONFIG_X86_64 + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return fred_entry_from_kvm(event_type, vector); +#endif + + /* + * Notably, we must use IDT dispatch for NMI when running in IDT mode. + * The FRED NMI context is significantly different and will not work + * right (specifically FRED fixed the NMI recursion issue). + */ + idt_do_nmi_irqoff(); +} +EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 6ba2b3adcef0..2bc217bb5475 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -75,3 +75,51 @@ THUNK warn_thunk_thunk, __warn_thunk #if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP) EXPORT_SYMBOL(__ref_stack_chk_guard); #endif + +#if IS_ENABLED(CONFIG_KVM_INTEL) +.macro IDT_DO_EVENT_IRQOFF call_insn call_target + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + \call_insn \call_target + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. 
objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + leave + RET +.endm + +#ifndef CONFIG_X86_64 +.pushsection .text, "ax" +SYM_FUNC_START(idt_do_interrupt_irqoff) + IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 +SYM_FUNC_END(idt_do_interrupt_irqoff) +.popsection +#endif + +.pushsection .noinstr.text, "ax" +SYM_FUNC_START(idt_do_nmi_irqoff) + IDT_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx +SYM_FUNC_END(idt_do_nmi_irqoff) +.popsection +#endif diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index 894f7f16eb80..0d2768ab836c 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm) RET SYM_FUNC_END(asm_fred_entry_from_kvm) -EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm); #endif diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index a6bfcc8243cd..d903bce24f15 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -178,7 +178,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); - do_munmap(mm, addr, image->size, NULL); + do_munmap(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, NULL); goto up_fail; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 810ab21ffd99..4b9e105309c6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1294,13 +1294,16 @@ int x86_perf_rdpmc_index(struct perf_event *event) return event->hw.event_base_rdpmc; } -static inline int match_prev_assignment(struct hw_perf_event *hwc, +static inline int match_prev_assignment(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { + struct hw_perf_event *hwc = &event->hw; + return hwc->idx == cpuc->assign[i] && - hwc->last_cpu == smp_processor_id() && - hwc->last_tag == cpuc->tags[i]; 
+ hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i] && + !is_acr_event_group(event); } static void x86_pmu_start(struct perf_event *event, int flags); @@ -1346,7 +1349,7 @@ static void x86_pmu_enable(struct pmu *pmu) * - no other event has used the counter since */ if (hwc->idx == -1 || - match_prev_assignment(hwc, cpuc, i)) + match_prev_assignment(event, cpuc, i)) continue; /* @@ -1367,7 +1370,7 @@ static void x86_pmu_enable(struct pmu *pmu) event = cpuc->event_list[i]; hwc = &event->hw; - if (!match_prev_assignment(hwc, cpuc, i)) + if (!match_prev_assignment(event, cpuc, i)) x86_assign_hw_event(event, cpuc, i); else if (i < n_running) continue; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d9488ade0f8e..dd1e3aa75ee9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3118,11 +3118,11 @@ static void intel_pmu_enable_fixed(struct perf_event *event) intel_set_masks(event, idx); /* - * Enable IRQ generation (0x8), if not PEBS, - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: + * Enable IRQ generation (0x8), if not PEBS or self-reloaded + * ACR event, and enable ring-3 counting (0x2) and ring-0 + * counting (0x1) if requested: */ - if (!event->attr.precise_ip) + if (!event->attr.precise_ip && !is_acr_self_reload_event(event)) bits |= INTEL_FIXED_0_ENABLE_PMI; if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) bits |= INTEL_FIXED_0_USER; @@ -3306,6 +3306,15 @@ static void intel_pmu_enable_event(struct perf_event *event) intel_set_masks(event, idx); static_call_cond(intel_pmu_enable_acr_event)(event); static_call_cond(intel_pmu_enable_event_ext)(event); + /* + * For self-reloaded ACR event, don't enable PMI since + * HW won't set overflow bit in GLOBAL_STATUS. Otherwise, + * the PMI would be recognized as a suspicious NMI. 
+ */ + if (is_acr_self_reload_event(event)) + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + else if (!event->attr.precise_ip) + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: @@ -3332,23 +3341,41 @@ static void intel_pmu_enable_event(struct perf_event *event) static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) { struct perf_event *event, *leader; - int i, j, idx; + int i, j, k, bit, idx; + /* + * FIXME: ACR mask parsing relies on cpuc->event_list[] (active events only). + * Disabling an ACR event causes bit-shifting errors in the acr_mask of + * remaining group members. As ACR sampling requires all events to be active, + * this limitation is acceptable for now. Revisit if independent event toggling + * is required. + */ for (i = 0; i < cpuc->n_events; i++) { leader = cpuc->event_list[i]; if (!is_acr_event_group(leader)) continue; - /* The ACR events must be contiguous. */ + /* Find the last event of the ACR group. */ for (j = i; j < cpuc->n_events; j++) { event = cpuc->event_list[j]; if (event->group_leader != leader->group_leader) break; - for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { - if (i + idx >= cpuc->n_events || - !is_acr_event_group(cpuc->event_list[i + idx])) - return; - __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + } + + /* + * Translate the user-space ACR mask (attr.config2) into the physical + * counter bitmask (hw.config1) for each ACR event in the group. + * NOTE: ACR event contiguity is guaranteed by intel_pmu_hw_config(). + */ + for (k = i; k < j; k++) { + event = cpuc->event_list[k]; + event->hw.config1 = 0; + for_each_set_bit(bit, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { + idx = i + bit; + /* Event index of ACR group must locate in [i, j). 
*/ + if (idx >= j || !is_acr_event_group(cpuc->event_list[idx])) + continue; + __set_bit(cpuc->assign[idx], (unsigned long *)&event->hw.config1); } } i = j - 1; @@ -7504,6 +7531,7 @@ static __always_inline void intel_pmu_init_pnc(struct pmu *pmu) hybrid(pmu, event_constraints) = intel_pnc_event_constraints; hybrid(pmu, pebs_constraints) = intel_pnc_pebs_event_constraints; hybrid(pmu, extra_regs) = intel_pnc_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } static __always_inline void intel_pmu_init_skt(struct pmu *pmu) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index fad87d3c8b2c..524668dcf4cc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -137,6 +137,16 @@ static inline bool is_acr_event_group(struct perf_event *event) return check_leader_group(event->group_leader, PERF_X86_EVENT_ACR); } +static inline bool is_acr_self_reload_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < 0) + return false; + + return test_bit(hwc->idx, (unsigned long *)&hwc->config1); +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4566000e15c4..078fd2c0d69d 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -14,3 +14,4 @@ generic-y += early_ioremap.h generic-y += fprobe.h generic-y += mcs_spinlock.h generic-y += mmzone.h +generic-y += ring_buffer.h diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ec95fe44fa3a..00aeae843529 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -438,6 +438,10 @@ extern void idt_setup_traps(void); extern void idt_setup_apic_and_irq_gates(void); extern bool idt_is_f00f_address(unsigned long address); +extern void idt_do_interrupt_irqoff(unsigned long address); +extern void idt_do_nmi_irqoff(void); +extern void 
idt_entry_from_kvm(unsigned int vector); + #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); #else diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 7e6b9314758a..2f2ce8aadf07 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -145,7 +145,7 @@ struct gate_struct { typedef struct gate_struct gate_desc; #ifndef _SETUP -static inline unsigned long gate_offset(const gate_desc *g) +static __always_inline unsigned long gate_offset(const gate_desc *g) { #ifdef CONFIG_X86_64 return g->offset_low | ((unsigned long)g->offset_middle << 16) | diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index dc8fe1361c18..be58b7f5c806 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -137,7 +137,8 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); -extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); +extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr, + const struct pt_regs *regs); extern void efi_unmap_boot_services(void); void arch_efi_call_virt_setup(void); diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 7535131c711b..eca24b5e07f4 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -97,4 +97,6 @@ static __always_inline void arch_exit_to_user_mode(void) } #define arch_exit_to_user_mode arch_exit_to_user_mode +extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector); + #endif diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 2bb65677c079..18a2f811c358 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -110,7 +110,6 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { ret static inline 
void cpu_init_fred_exceptions(void) { } static inline void cpu_init_fred_rsps(void) { } static inline void fred_complete_exception_setup(void) { } -static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } static inline void fred_sync_rsp0(unsigned long rsp0) { } static inline void fred_update_rsp0(void) { } #endif /* CONFIG_X86_FRED */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c470e40a00aa..f14009f25a3b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1504,6 +1504,7 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; + struct ratelimit_state kvmclock_update_rs; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a14a0f43e04a..86554de9a3f5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -803,9 +803,10 @@ #define MSR_AMD64_LBR_SELECT 0xc000010e /* Zen4 */ -#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG 0xc001102e #define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4 #define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 +#define MSR_ZEN2_BP_CFG_BUG_FIX_BIT 33 /* Fam 19h MSRs */ #define MSR_F19H_UMC_PERF_CTL 0xc0010800 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 10b5355b323e..67dd932305db 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -733,6 +733,7 @@ bool xen_set_default_idle(void); #endif void __noreturn stop_this_cpu(void *dummy); +extern bool x86_hypervisor_present; void microcode_check(struct cpuinfo_x86 *prev_info); void store_cpu_caps(struct cpuinfo_x86 *info); diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index d7c8ef1e354d..be4c5e9e5ff6 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -88,19 +88,19 @@ static void amd_set_max_freq_ratio(void) rc = cppc_get_perf_caps(0, 
&perf_caps); if (rc) { - pr_warn("Could not retrieve perf counters (%d)\n", rc); + pr_debug("Could not retrieve perf counters (%d)\n", rc); return; } rc = amd_get_boost_ratio_numerator(0, &numerator); if (rc) { - pr_warn("Could not retrieve highest performance (%d)\n", rc); + pr_debug("Could not retrieve highest performance (%d)\n", rc); return; } nominal_perf = perf_caps.nominal_perf; if (!nominal_perf) { - pr_warn("Could not retrieve nominal performance\n"); + pr_debug("Could not retrieve nominal performance\n"); return; } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d9ae6ab1701..31f01e9c7114 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -518,7 +518,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) break; case 0x50 ... 0x5f: case 0x80 ... 0xaf: - case 0xc0 ... 0xcf: + case 0xc0 ... 0xef: setup_force_cpu_cap(X86_FEATURE_ZEN6); break; default: @@ -989,6 +989,9 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) /* Correct misconfigured CPUID on some clients. 
*/ clear_cpu_cap(c, X86_FEATURE_INVLPGB); + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN2_BP_CFG_BUG_FIX_BIT); } static void init_amd_zen3(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 146f6f8b0650..99801e844b30 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -92,6 +92,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_FRED, X86_FEATURE_LKGS }, { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, { X86_FEATURE_LASS, X86_FEATURE_SMAP }, + { X86_FEATURE_INVLPGB, X86_FEATURE_PCID }, {} }; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8dd424ac5de8..f3a793e3a6c8 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -90,7 +90,6 @@ struct mca_config mca_cfg __read_mostly = { }; static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); -static unsigned long mce_need_notify; /* * MCA banks polled by the period polling timer for corrected events. @@ -152,8 +151,10 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); void mce_log(struct mce_hw_err *err) { - if (mce_gen_pool_add(err)) + if (mce_gen_pool_add(err)) { + pr_info(HW_ERR "Machine check events logged\n"); irq_work_queue(&mce_irq_work); + } } EXPORT_SYMBOL_GPL(mce_log); @@ -585,28 +586,6 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. 
- */ -static bool mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return true; - } - - return false; -} - static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -618,9 +597,7 @@ static int mce_early_notifier(struct notifier_block *nb, unsigned long val, /* Emit the trace record: */ trace_mce_record(err); - set_bit(0, &mce_need_notify); - - mce_notify_irq(); + mce_work_trigger(); return NOTIFY_DONE; } @@ -1804,7 +1781,7 @@ static void mce_timer_fn(struct timer_list *t) * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. */ - if (mce_notify_irq()) + if (!mce_gen_pool_empty()) iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index e533881284a1..5c0afae75e9f 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -322,7 +322,7 @@ static u32 get_patch_level(void) { u32 rev, dummy __always_unused; - if (IS_ENABLED(CONFIG_MICROCODE_DBG) && hypervisor_present) { + if (IS_ENABLED(CONFIG_MICROCODE_DBG) && x86_hypervisor_present) { int cpu = smp_processor_id(); if (!microcode_rev[cpu]) { @@ -714,7 +714,7 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, invlpg(p_addr_end); } - if (IS_ENABLED(CONFIG_MICROCODE_DBG) && hypervisor_present) + if (IS_ENABLED(CONFIG_MICROCODE_DBG) && x86_hypervisor_present) microcode_rev[smp_processor_id()] = mc->hdr.patch_id; /* verify patch application was successful */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 651202e6fefb..45ca406a8112 100644 
--- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -57,7 +57,7 @@ bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); u32 base_rev; u32 microcode_rev[NR_CPUS] = {}; -bool hypervisor_present; +bool __ro_after_init x86_hypervisor_present; /* * Synchronization. @@ -118,14 +118,9 @@ bool __init microcode_loader_disabled(void) /* * Disable when: * - * 1) The CPU does not support CPUID. - */ - if (!cpuid_feature()) { - dis_ucode_ldr = true; - return dis_ucode_ldr; - } - - /* + * 1) The CPU does not support CPUID, detected below in + * load_ucode_bsp(). + * * 2) Bit 31 in CPUID[1]:ECX is clear * The bit is reserved for hypervisor use. This is still not * completely accurate as XEN PV guests don't see that CPUID bit @@ -135,9 +130,7 @@ bool __init microcode_loader_disabled(void) * 3) Certain AMD patch levels are not allowed to be * overwritten. */ - hypervisor_present = native_cpuid_ecx(1) & BIT(31); - - if ((hypervisor_present && !IS_ENABLED(CONFIG_MICROCODE_DBG)) || + if ((x86_hypervisor_present && !IS_ENABLED(CONFIG_MICROCODE_DBG)) || amd_check_current_patch_level()) dis_ucode_ldr = true; @@ -179,6 +172,11 @@ void __init load_ucode_bsp(void) early_parse_cmdline(); + if (!cpuid_feature()) + dis_ucode_ldr = true; + else + x86_hypervisor_present = native_cpuid_ecx(1) & BIT(31); + if (microcode_loader_disabled()) return; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 37ac4afe0972..a4c0a0cf928b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -138,6 +138,9 @@ u32 intel_get_platform_id(void) { unsigned int val[2]; + if (x86_hypervisor_present) + return 0; + /* * This can be called early. Use CPUID directly instead of * relying on cpuinfo_x86 which may not be fully initialized. 
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 3b93c0676b4f..a10b547eda1e 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -48,7 +48,6 @@ extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; extern u32 microcode_rev[NR_CPUS]; extern u32 base_rev; -extern bool hypervisor_present; struct cpio_data find_microcode_in_initrd(const char *path); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 9bd87bae4983..59215fef3924 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -377,7 +377,12 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = { static __init int snc_get_config(void) { - int ret = topology_num_nodes_per_package(); + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 1; + + ret = topology_num_nodes_per_package(); if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) { pr_warn("CoD enabled system? 
Resctrl not supported\n"); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 2a9992758933..eb72537bc0b1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -450,6 +450,10 @@ __init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entr { struct boot_e820_entry *entry = entries; + /* If there aren't any entries, we'll want to fall back to another source: */ + if (!nr_entries) + return -ENOENT; + while (nr_entries) { u64 start = entry->addr; u64 size = entry->size; @@ -458,7 +462,7 @@ __init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entr /* Ignore the remaining entries on 64-bit overflow: */ if (start > end && likely(size)) - return -1; + return -EINVAL; e820__range_add(start, size, type); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c3ec2512f2bb..20b638c507ca 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -27,14 +27,19 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, struct _fpx_sw_bytes *fx_sw) { + int min_xstate_size = sizeof(struct fxregs_state) + + sizeof(struct xstate_header); void __user *fpstate = fxbuf; unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) return false; - /* Check for the first magic field */ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1) + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > x86_task_fpu(current)->fpstate->user_size || + fx_sw->xstate_size > fx_sw->extended_size) goto setfx; /* @@ -43,7 +48,7 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * fpstate layout with out copying the extended state information * in the memory layout. 
*/ - if (__get_user(magic2, (__u32 __user *)(fpstate + x86_task_fpu(current)->fpstate->user_size))) + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0543b57f54ee..17d6edfcb7e0 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -376,6 +376,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) } /* + * Generated trampoline may contain rIP-relative addressing which + * displacement needs to be fixed. + */ + text_poke_apply_relocation(trampoline, trampoline, size, + (void *)start_offset, size); + + /* * The address of the ftrace_ops that is used for this trampoline * is stored at the end of the trampoline. This will be used to * load the third parameter for the callback. Basically, that diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 260456588756..90a22e24a9eb 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -268,6 +268,13 @@ void __init idt_setup_early_pf(void) } #endif +#if IS_ENABLED(CONFIG_KVM_INTEL) && !defined(CONFIG_X86_64) +void idt_entry_from_kvm(unsigned int vector) +{ + idt_do_interrupt_irqoff(gate_offset(idt_table + vector)); +} +#endif + static void __init idt_map_in_cea(void) { /* diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 3d239ed12744..52a3afb1b79e 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -614,7 +614,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx) { exc_nmi(regs); } -EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx); #endif #ifdef CONFIG_NMI_CHECK_CPU diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4ffba68dc57b..eaeb77464c06 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -136,6 +136,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * %r13 original CR4 when relocate_kernel() was invoked */ + /* + * Set 
return address to 0 if not preserving context. The purgatory + * shipped in kexec-tools will unconditionally look for the return + * address on the stack and set a kexec_jump_back_entry= command + * line option if it's non-zero. There's no other way that it can + * tell a preserve-context (kjump) kexec from a normal one. + */ + pushq $0 /* store the start address on the stack */ pushq %rdx diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c8c6cc0406d6..8013dccb3110 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4481,7 +4481,7 @@ static const struct opcode opcode_map_0f_38[256] = { X16(N), X16(N), /* 0x20 - 0x2f */ X8(N), - X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, + X2(N), GP(SrcMem | DstReg | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, /* 0x30 - 0x7f */ X16(N), X16(N), X16(N), X16(N), X16(N), /* 0x80 - 0xef */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 9b140bbdc1d8..4438ecac9a89 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2040,7 +2040,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) * flush). Translate the address here so the memory can be uniformly * read with kvm_read_guest(). 
*/ - if (!hc->fast && is_guest_mode(vcpu)) { + if (!hc->fast && mmu_is_nested(vcpu)) { hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); if (unlikely(hc->ingpa == INVALID_GPA)) return HV_STATUS_INVALID_HYPERCALL_INPUT; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e3ec4d8607c1..4078e624ca66 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -667,13 +667,15 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) u32 *__pir = (void *)pir_vals; u32 i, vec; u32 irr_val, prev_irr_val; - int max_updated_irr; + int max_new_irr; - max_updated_irr = -1; - *max_irr = -1; - - if (!pi_harvest_pir(pir, pir_vals)) + if (!pi_harvest_pir(pir, pir_vals)) { + *max_irr = apic_find_highest_vector(regs + APIC_IRR); return false; + } + + max_new_irr = -1; + *max_irr = -1; for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); @@ -688,25 +690,25 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); if (prev_irr_val != irr_val) - max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec; + max_new_irr = __fls(irr_val ^ prev_irr_val) + vec; } if (irr_val) *max_irr = __fls(irr_val) + vec; } - return ((max_updated_irr != -1) && - (max_updated_irr == *max_irr)); + return max_new_irr != -1 && max_new_irr == *max_irr; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; - bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); + bool max_irr_is_from_pir; - if (unlikely(!apic->apicv_active && irr_updated)) + max_irr_is_from_pir = __kvm_apic_update_irr(pir, apic->regs, max_irr); + if (unlikely(!apic->apicv_active && max_irr_is_from_pir)) apic->irr_pending = true; - return irr_updated; + return max_irr_is_from_pir; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_irr); diff --git 
a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 24fbc9ea502a..f0144ae8d891 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -182,6 +182,8 @@ static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; static void mmu_spte_set(u64 *sptep, u64 spte); +static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte, struct list_head *invalid_list); struct kvm_mmu_role_regs { const unsigned long cr0; @@ -1287,19 +1289,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) -{ - struct kvm_mmu_page *sp; - - sp = sptep_to_sp(sptep); - WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); - - drop_spte(kvm, sptep); - - if (flush) - kvm_flush_remote_tlbs_sptep(kvm, sptep); -} - /* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. @@ -2466,7 +2455,8 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, { union kvm_mmu_page_role role; - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) && + spte_to_child_sp(*sptep) && spte_to_child_sp(*sptep)->gfn == gfn) return ERR_PTR(-EEXIST); role = kvm_mmu_child_role(sptep, direct, access); @@ -2536,6 +2526,23 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } +/* + * Note: while normally KVM uses a "bool flush" return value to let + * the caller batch flushes, __link_shadow_page() flushes immediately + * before populating the parent PTE with the new shadow page. The + * typical callers, direct_map() and FNAME(fetch)(), are not going + * to zap more than one huge SPTE anyway. 
+ * + * The only exception, where @flush can be false, is when a huge SPTE + * is replaced with a shadow page SPTE with a fully populated page table, + * which can happen from shadow_mmu_split_huge_page(). In this case, + * no memory is unmapped across the change to the page tables and no + * immediate flush is needed for correctness. + * + * Even in that case, calls to kvm_mmu_commit_zap_page() are not + * batched. Doing so would require adding an invalid_list argument + * all the way down to __walk_slot_rmaps(). + */ static void __link_shadow_page(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, u64 *sptep, struct kvm_mmu_page *sp, bool flush) @@ -2544,13 +2551,18 @@ static void __link_shadow_page(struct kvm *kvm, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - /* - * If an SPTE is present already, it must be a leaf and therefore - * a large one. Drop it, and flush the TLB if needed, before - * installing sp. - */ - if (is_shadow_present_pte(*sptep)) - drop_large_spte(kvm, sptep, flush); + if (is_shadow_present_pte(*sptep)) { + struct kvm_mmu_page *parent_sp; + LIST_HEAD(invalid_list); + + parent_sp = sptep_to_sp(sptep); + WARN_ON_ONCE(parent_sp->role.level == PG_LEVEL_4K); + + if (mmu_page_zap_pte(kvm, parent_sp, sptep, &invalid_list)) + kvm_mmu_commit_zap_page(kvm, &invalid_list); + else if (flush) + kvm_flush_remote_tlbs_sptep(kvm, sptep); + } spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index adf211860949..cdd5a6dc646f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -207,6 +207,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); /* + * Flush the TLB when enabling (x2)AVIC and when transitioning between + * xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the + * "wrong" mapping. 
+ * + * KVM uses a per-VM "scratch" page to back the APIC memslot, because + * KVM also uses per-VM page tables *and* maintains the page table (NPT + * or shadow page) mappings for said memslot even if one or more vCPUs + * have their local APIC hardware-disabled or are in x2APIC mode, i.e. + * even if one or more vCPUs' APIC MMIO BAR is effectively disabled. + * + * If xAVIC is fully enabled, hardware ignores the physical address in + * KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and + * instead redirects the access to the AVIC backing page, i.e. to the + * vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either + * hardware-disabled or in x2APIC mode), then guest accesses will use + * the page table mapping verbatim, i.e. will access the per-VM scratch + * page, as normal memory. + * + * In both cases, the CPU is allowed to cache TLB entries for the APIC + * base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as + * accesses need to be redirected to the virtual APIC page, but the TLB + * may contain entries pointing at the scratch page. KVM also needs to + * flush the TLB when enabling x2AVIC, as accesses need to go to the + * scratch page, but the TLB may contain entries tagged as xAVIC, i.e. + * entries pointing to the vCPU's virtual APIC page. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); + + /* * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR * accesses, while interrupt injection to a running vCPU can be * achieved using AVIC doorbell. KVM disables the APIC access page @@ -219,12 +248,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { - /* - * Flush the TLB, the guest may have inserted a non-APIC - * mapping into the TLB while AVIC was disabled. 
- */ - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); - /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } @@ -1300,12 +1323,14 @@ bool __init avic_hardware_setup(void) } /* - * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) - * due to erratum 1235, which results in missed VM-Exits on the sender - * and thus missed wake events for blocking vCPUs due to the CPU - * failing to see a software update to clear IsRunning. + * Disable IPI virtualization for AMD Family 17h (Zen1 and Zen2) and + * Hygon Family 18h (derived from AMD Zen1) CPUs due to erratum 1235, + * which results in missed VM-Exits on the sender and thus missed wake + * events for blocking vCPUs due to the CPU failing to see a software + * update to clear IsRunning. */ - enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17; + if (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18) + enable_ipiv = false; amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 961804df5f45..b340dc9991ad 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -160,6 +160,16 @@ void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm) if (!intercept_smi) vmcb_clr_intercept(&vmcb02->control, INTERCEPT_SMI); + /* + * Intercept PAUSE if and only if L1 wants to. KVM intercepts PAUSE so + * that a vCPU that may be spinning waiting for a lock can be scheduled + * out in favor of the vCPU that holds said lock. KVM doesn't support + * yielding across L2 vCPUs, as KVM has limited visibility into which + * L2 vCPUs are in the same L2 VM, i.e. may be contending for locks.
+ */ + if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_PAUSE); + if (nested_vmcb_needs_vls_intercept(svm)) { /* * If the virtual VMLOAD/VMSAVE is not enabled for the L2, @@ -819,7 +829,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; - u32 pause_count12, pause_thresh12; nested_svm_transition_tlb_flush(vcpu); @@ -947,31 +956,13 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) - pause_count12 = vmcb12_ctrl->pause_filter_count; + vmcb02->control.pause_filter_count = vmcb12_ctrl->pause_filter_count; else - pause_count12 = 0; + vmcb02->control.pause_filter_count = 0; if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) - pause_thresh12 = vmcb12_ctrl->pause_filter_thresh; + vmcb02->control.pause_filter_thresh = vmcb12_ctrl->pause_filter_thresh; else - pause_thresh12 = 0; - if (kvm_pause_in_guest(svm->vcpu.kvm)) { - /* use guest values since host doesn't intercept PAUSE */ - vmcb02->control.pause_filter_count = pause_count12; - vmcb02->control.pause_filter_thresh = pause_thresh12; - - } else { - /* start from host values otherwise */ - vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count; - vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; - - /* ... but ensure filtering is disabled if so requested. 
*/ - if (vmcb12_is_intercept(vmcb12_ctrl, INTERCEPT_PAUSE)) { - if (!pause_count12) - vmcb02->control.pause_filter_count = 0; - if (!pause_thresh12) - vmcb02->control.pause_filter_thresh = 0; - } - } + vmcb02->control.pause_filter_thresh = 0; /* * Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to @@ -1298,12 +1289,6 @@ void nested_svm_vmexit(struct vcpu_svm *svm) /* in case we halted in L2 */ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); - if (!kvm_pause_in_guest(vcpu->kvm)) { - vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; - vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); - - } - /* * Invalidate last_bus_lock_rip unless KVM is still waiting for the * guest to make forward progress before re-enabling bus lock detection. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c2126b3c3072..6c6a6d663e29 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3313,37 +3313,6 @@ void sev_guest_memory_reclaimed(struct kvm *kvm) sev_writeback_caches(kvm); } -void sev_free_vcpu(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm; - - if (!is_sev_es_guest(vcpu)) - return; - - svm = to_svm(vcpu); - - /* - * If it's an SNP guest, then the VMSA was marked in the RMP table as - * a guest-owned page. Transition the page to hypervisor state before - * releasing it back to the system. 
- */ - if (is_sev_snp_guest(vcpu)) { - u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; - - if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) - goto skip_vmsa_free; - } - - if (vcpu->arch.guest_state_protected) - sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); - - __free_page(virt_to_page(svm->sev_es.vmsa)); - -skip_vmsa_free: - if (svm->sev_es.ghcb_sa_free) - kvfree(svm->sev_es.ghcb_sa); -} - static void dump_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3583,6 +3552,20 @@ vmgexit_err: return 1; } +static void __sev_es_unmap_ghcb(struct vcpu_svm *svm) +{ + if (svm->sev_es.ghcb_sa_free) { + kvfree(svm->sev_es.ghcb_sa); + svm->sev_es.ghcb_sa = NULL; + svm->sev_es.ghcb_sa_free = false; + } + + if (svm->sev_es.ghcb) { + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); + svm->sev_es.ghcb = NULL; + } +} + void sev_es_unmap_ghcb(struct vcpu_svm *svm) { /* Clear any indication that the vCPU is in a type of AP Reset Hold */ @@ -3591,31 +3574,51 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) if (!svm->sev_es.ghcb) return; - if (svm->sev_es.ghcb_sa_free) { - /* - * The scratch area lives outside the GHCB, so there is a - * buffer that, depending on the operation performed, may - * need to be synced, then freed. - */ - if (svm->sev_es.ghcb_sa_sync) { - kvm_write_guest(svm->vcpu.kvm, - svm->sev_es.sw_scratch, - svm->sev_es.ghcb_sa, - svm->sev_es.ghcb_sa_len); - svm->sev_es.ghcb_sa_sync = false; - } - - kvfree(svm->sev_es.ghcb_sa); - svm->sev_es.ghcb_sa = NULL; - svm->sev_es.ghcb_sa_free = false; + /* + * If the scratch area lives outside the GHCB, there's a buffer that, + * depending on the operation performed, may need to be synced. 
+ */ + if (svm->sev_es.ghcb_sa_sync) { + kvm_write_guest(svm->vcpu.kvm, svm->sev_es.sw_scratch, + svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); + svm->sev_es.ghcb_sa_sync = false; } trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); - svm->sev_es.ghcb = NULL; + __sev_es_unmap_ghcb(svm); +} + +void sev_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm; + + if (!is_sev_es_guest(vcpu)) + return; + + svm = to_svm(vcpu); + + /* + * If it's an SNP guest, then the VMSA was marked in the RMP table as + * a guest-owned page. Transition the page to hypervisor state before + * releasing it back to the system. + */ + if (is_sev_snp_guest(vcpu)) { + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) + goto skip_vmsa_free; + } + + if (vcpu->arch.guest_state_protected) + sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); + + __free_page(virt_to_page(svm->sev_es.vmsa)); + +skip_vmsa_free: + __sev_es_unmap_ghcb(svm); } int pre_sev_run(struct vcpu_svm *svm, int cpu) @@ -3662,26 +3665,31 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu) } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) -static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) +static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 min_len) { struct vmcb_control_area *control = &svm->vmcb->control; u64 ghcb_scratch_beg, ghcb_scratch_end; u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; + if (WARN_ON_ONCE(!min_len)) + goto e_scratch; + scratch_gpa_beg = svm->sev_es.sw_scratch; if (!scratch_gpa_beg) { pr_err("vmgexit: scratch gpa not provided\n"); goto e_scratch; } - scratch_gpa_end = scratch_gpa_beg + len; + scratch_gpa_end = scratch_gpa_beg + min_len; if (scratch_gpa_end < scratch_gpa_beg) { pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", - len, scratch_gpa_beg); + min_len, scratch_gpa_beg); 
goto e_scratch; } + WARN_ON_ONCE(svm->sev_es.ghcb_sa_sync || svm->sev_es.ghcb_sa_free); + if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { /* Scratch area begins within GHCB */ ghcb_scratch_beg = control->ghcb_gpa + @@ -3702,21 +3710,29 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_va = (void *)svm->sev_es.ghcb; scratch_va += (scratch_gpa_beg - control->ghcb_gpa); + + svm->sev_es.ghcb_sa_sync = false; + svm->sev_es.ghcb_sa_free = false; + svm->sev_es.ghcb_sa_len = ghcb_scratch_end - scratch_gpa_beg; } else { + /* GHCB v2 requires the scratch area to be within the GHCB. */ + if (to_kvm_sev_info(svm->vcpu.kvm)->ghcb_version >= 2) + goto e_scratch; + /* * The guest memory must be read into a kernel buffer, so * limit the size */ - if (len > GHCB_SCRATCH_AREA_LIMIT) { + if (min_len > GHCB_SCRATCH_AREA_LIMIT) { pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", - len, GHCB_SCRATCH_AREA_LIMIT); + min_len, GHCB_SCRATCH_AREA_LIMIT); goto e_scratch; } - scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); + scratch_va = kvzalloc(min_len, GFP_KERNEL_ACCOUNT); if (!scratch_va) return -ENOMEM; - if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { + if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, min_len)) { /* Unable to copy scratch area from guest */ pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); @@ -3732,11 +3748,10 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) */ svm->sev_es.ghcb_sa_sync = sync; svm->sev_es.ghcb_sa_free = true; + svm->sev_es.ghcb_sa_len = min_len; } svm->sev_es.ghcb_sa = scratch_va; - svm->sev_es.ghcb_sa_len = len; - return 0; e_scratch: @@ -3833,13 +3848,11 @@ struct psc_buffer { struct psc_entry entries[]; } __packed; -static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); +static int snp_do_psc(struct vcpu_svm *svm); static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) { - 
svm->sev_es.psc_inflight = 0; - svm->sev_es.psc_idx = 0; - svm->sev_es.psc_2m = false; + memset(&svm->sev_es.psc, 0, sizeof(svm->sev_es.psc)); /* * PSC requests always get a "no action" response in SW_EXITINFO1, with @@ -3852,9 +3865,8 @@ static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) static void __snp_complete_one_psc(struct vcpu_svm *svm) { - struct psc_buffer *psc = svm->sev_es.ghcb_sa; - struct psc_entry *entries = psc->entries; - struct psc_hdr *hdr = &psc->hdr; + struct vcpu_sev_es_state *sev_es = &svm->sev_es; + struct psc_buffer *guest_psc = sev_es->ghcb_sa; __u16 idx; /* @@ -3862,20 +3874,20 @@ static void __snp_complete_one_psc(struct vcpu_svm *svm) * corresponding entries in the guest's PSC buffer and zero out the * count of in-flight PSC entries. */ - for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; - svm->sev_es.psc_inflight--, idx++) { - struct psc_entry *entry = &entries[idx]; + for (idx = sev_es->psc.cur_idx; sev_es->psc.batch_size; + sev_es->psc.batch_size--, idx++) { + struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); - entry->cur_page = entry->pagesize ? 512 : 1; + guest_psc->entries[idx].cur_page = entry.pagesize ? 512 : 1; } - hdr->cur_entry = idx; + sev_es->psc.cur_idx = idx; + guest_psc->hdr.cur_entry = idx; } static int snp_complete_one_psc(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct psc_buffer *psc = svm->sev_es.ghcb_sa; if (vcpu->run->hypercall.ret) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); @@ -3885,48 +3897,30 @@ static int snp_complete_one_psc(struct kvm_vcpu *vcpu) __snp_complete_one_psc(svm); /* Handle the next range (if any). 
*/ - return snp_begin_psc(svm, psc); + return snp_do_psc(svm); } -static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) +static int snp_do_psc(struct vcpu_svm *svm) { - struct psc_entry *entries = psc->entries; + struct vcpu_sev_es_state *sev_es = &svm->sev_es; + struct psc_buffer *guest_psc = sev_es->ghcb_sa; struct kvm_vcpu *vcpu = &svm->vcpu; - struct psc_hdr *hdr = &psc->hdr; struct psc_entry entry_start; - u16 idx, idx_start, idx_end; int npages; bool huge; u64 gfn; - - if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { - snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); - return 1; - } + u16 idx; next_range: /* There should be no other PSCs in-flight at this point. */ - if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { + if (WARN_ON_ONCE(svm->sev_es.psc.batch_size)) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); return 1; } - /* - * The PSC descriptor buffer can be modified by a misbehaved guest after - * validation, so take care to only use validated copies of values used - * for things like array indexing. - */ - idx_start = hdr->cur_entry; - idx_end = hdr->end_entry; - - if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { - snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); - return 1; - } - /* Find the start of the next range which needs processing. */ - for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { - entry_start = entries[idx]; + for (idx = sev_es->psc.cur_idx; idx <= sev_es->psc.end_idx; idx++) { + entry_start = READ_ONCE(guest_psc->entries[idx]); gfn = entry_start.gfn; huge = entry_start.pagesize; @@ -3952,32 +3946,40 @@ next_range: if (npages) break; + + /* + * Increment the guest-visible index to communicate the current + * entry back to the guest, e.g. in case of failure. No need + * for READ_ONCE() as KVM doesn't consume the field, i.e. a + * misbehaving guest can only break itself. + */ + guest_psc->hdr.cur_entry++; } - if (idx > idx_end) { + if (idx > sev_es->psc.end_idx) { /* Nothing more to process. 
*/ snp_complete_psc(svm, 0); return 1; } - svm->sev_es.psc_2m = huge; - svm->sev_es.psc_idx = idx; - svm->sev_es.psc_inflight = 1; + sev_es->psc.is_2m = huge; + sev_es->psc.cur_idx = idx; + sev_es->psc.batch_size = 1; /* * Find all subsequent PSC entries that contain adjacent GPA * ranges/operations and can be combined into a single * KVM_HC_MAP_GPA_RANGE exit. */ - while (++idx <= idx_end) { - struct psc_entry entry = entries[idx]; + while (++idx <= sev_es->psc.end_idx) { + struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); if (entry.operation != entry_start.operation || entry.gfn != entry_start.gfn + npages || entry.cur_page || !!entry.pagesize != huge) break; - svm->sev_es.psc_inflight++; + sev_es->psc.batch_size++; npages += huge ? 512 : 1; } @@ -4019,6 +4021,46 @@ next_range: BUG(); } +static int snp_begin_psc(struct vcpu_svm *svm) +{ + struct vcpu_sev_es_state *sev_es = &svm->sev_es; + struct psc_buffer *guest_psc = sev_es->ghcb_sa; + u16 max_nr_entries; + + if (!user_exit_on_hypercall(svm->vcpu.kvm, KVM_HC_MAP_GPA_RANGE)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * GHCB v2 requires the scratch area to reside within the GHCB itself, + * and PSC requests are only supported for GHCB v2+. Thus it should be + * impossible to exceed the max PSC entry count (which is derived from + * the size of the shared GHCB buffer). + */ + max_nr_entries = (sev_es->ghcb_sa_len - sizeof(struct psc_hdr)) / + sizeof(struct psc_entry); + if (WARN_ON_ONCE(max_nr_entries > VMGEXIT_PSC_MAX_COUNT)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * The PSC descriptor buffer can be modified by a misbehaved guest after + * validation, so take care to only use validated copies of values used + * for things like array indexing. 
+ */ + sev_es->psc.cur_idx = READ_ONCE(guest_psc->hdr.cur_entry); + sev_es->psc.end_idx = READ_ONCE(guest_psc->hdr.end_entry); + + if (sev_es->psc.end_idx >= max_nr_entries) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); + return 1; + } + + return snp_do_psc(svm); +} + /* * Invoked as part of svm_vcpu_reset() processing of an init event. */ @@ -4493,13 +4535,22 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) case SVM_VMGEXIT_MMIO_READ: case SVM_VMGEXIT_MMIO_WRITE: { bool is_write = control->exit_code == SVM_VMGEXIT_MMIO_WRITE; + u64 len = control->exit_info_2; - ret = setup_vmgexit_scratch(svm, !is_write, control->exit_info_2); + if (!len) + return 1; + + if (to_kvm_sev_info(vcpu->kvm)->ghcb_version >= 2 && len > 8) { + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); + return 1; + } + + ret = setup_vmgexit_scratch(svm, !is_write, len); if (ret) break; - ret = kvm_sev_es_mmio(vcpu, is_write, control->exit_info_1, - control->exit_info_2, svm->sev_es.ghcb_sa); + ret = kvm_sev_es_mmio(vcpu, is_write, control->exit_info_1, len, + svm->sev_es.ghcb_sa); break; } case SVM_VMGEXIT_NMI_COMPLETE: @@ -4546,11 +4597,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) vcpu->run->system_event.data[0] = control->ghcb_gpa; break; case SVM_VMGEXIT_PSC: - ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + ret = setup_vmgexit_scratch(svm, true, sizeof(struct psc_hdr)); if (ret) break; - ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); + ret = snp_begin_psc(svm); break; case SVM_VMGEXIT_AP_CREATION: ret = sev_snp_ap_creation(svm); @@ -4572,6 +4623,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) control->exit_info_1, control->exit_info_2); ret = -EINVAL; break; + case SVM_EXIT_IOIO: + if (!((control->exit_info_1 & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT)) + return 1; + + fallthrough; default: ret = svm_invoke_exit_handler(vcpu, control->exit_code); } @@ -4592,6 +4648,9 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) if 
(unlikely(check_mul_overflow(count, size, &bytes))) return -EINVAL; + if (!bytes) + return 1; + r = setup_vmgexit_scratch(svm, in, bytes); if (r) return r; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e7fdd7a9c280..e02a38da5296 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -913,7 +913,15 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; - if (kvm_pause_in_guest(vcpu->kvm)) + /* Adjusting pause_filter_count makes no sense if PLE is disabled. */ + WARN_ON_ONCE(kvm_pause_in_guest(vcpu->kvm)); + + /* + * While running L2, KVM should intercept PAUSE if and only if L1 wants + * to intercept PAUSE, and L1's intercept should take priority, i.e. + * KVM should never handle a PAUSE intercept from L2. + */ + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return; control->pause_filter_count = __grow_ple_window(old, @@ -934,7 +942,10 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; - if (kvm_pause_in_guest(vcpu->kvm)) + /* Adjusting pause_filter_count makes no sense if PLE is disabled. 
*/ + WARN_ON_ONCE(kvm_pause_in_guest(vcpu->kvm)); + + if (is_guest_mode(vcpu)) return; control->pause_filter_count = diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a10668d17a16..5137416be593 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -257,9 +257,12 @@ struct vcpu_sev_es_state { bool ghcb_sa_free; /* SNP Page-State-Change buffer entries currently being processed */ - u16 psc_idx; - u16 psc_inflight; - bool psc_2m; + struct { + u16 cur_idx; + u16 end_idx; + u16 batch_size; + bool is_2m; + } psc; u64 ghcb_registered_gpa; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e7fdbe9efc90..0db25bba17f6 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall, __entry->a2 = a2; __entry->a3 = a3; __entry->a4 = a4; - __entry->a4 = a5; + __entry->a5 = a5; ), TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 56cacc06225e..31568274d8bb 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -14,6 +14,7 @@ extern bool __read_mostly flexpriority_enabled; extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; +extern bool __read_mostly enable_cet; extern bool __read_mostly enable_pml; extern int __read_mostly pt_mode; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 8a481dae9cae..ff1f254a0ef4 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -31,38 +31,6 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif -.macro VMX_DO_EVENT_IRQOFF call_insn call_target - /* - * Unconditionally create a stack frame, getting the correct RSP on the - * stack (for x86-64) would take two instructions anyways, and RBP can - * be used to restore RSP to make objtool happy (see below). 
- */ - push %_ASM_BP - mov %_ASM_SP, %_ASM_BP - -#ifdef CONFIG_X86_64 - /* - * Align RSP to a 16-byte boundary (to emulate CPU behavior) before - * creating the synthetic interrupt stack frame for the IRQ/NMI. - */ - and $-16, %rsp - push $__KERNEL_DS - push %rbp -#endif - pushf - push $__KERNEL_CS - \call_insn \call_target - - /* - * "Restore" RSP from RBP, even though IRET has already unwound RSP to - * the correct value. objtool doesn't know the callee will IRET and, - * without the explicit restore, thinks the stack is getting walloped. - * Using an unwind hint is problematic due to x86-64's dynamic alignment. - */ - leave - RET -.endm - .section .noinstr.text, "ax" /** @@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) SYM_FUNC_END(__vmx_vcpu_run) -SYM_FUNC_START(vmx_do_nmi_irqoff) - VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx -SYM_FUNC_END(vmx_do_nmi_irqoff) - #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /** @@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline) RET SYM_FUNC_END(vmread_error_trampoline) #endif - -.section .text, "ax" - -#ifndef CONFIG_X86_FRED - -SYM_FUNC_START(vmx_do_interrupt_irqoff) - VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 -SYM_FUNC_END(vmx_do_interrupt_irqoff) - -#endif diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a29896a9ef14..b9103de01428 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -108,6 +108,9 @@ module_param_named(unrestricted_guest, bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, 0444); +bool __read_mostly enable_cet = 1; +module_param_named(cet, enable_cet, bool, 0444); + static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, 0444); @@ -4476,7 +4479,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter * 3 and 4 for details. 
*/ - if (cpu_has_load_cet_ctrl()) { + if (enable_cet) { vmcs_writel(HOST_S_CET, kvm_host.s_cet); vmcs_writel(HOST_SSP, 0); vmcs_writel(HOST_INTR_SSP_TABLE, 0); @@ -4532,6 +4535,10 @@ static u32 vmx_get_initial_vmentry_ctrl(void) if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); + + if (!enable_cet) + vmentry_ctrl &= ~VM_ENTRY_LOAD_CET_STATE; + /* * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. */ @@ -4546,6 +4553,9 @@ static u32 vmx_get_initial_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + if (!enable_cet) + vmexit_ctrl &= ~VM_EXIT_LOAD_CET_STATE; + /* * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for * nested virtualization and thus allowed to be set in vmcs12. @@ -7029,8 +7039,8 @@ static void vmx_set_rvi(int vector) int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); + bool max_irr_is_from_pir; int max_irr; - bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; @@ -7042,17 +7052,22 @@ int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - got_posted_interrupt = - kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); + max_irr_is_from_pir = kvm_apic_update_irr(vcpu, vt->pi_desc.pir, + &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); - got_posted_interrupt = false; + max_irr_is_from_pir = false; } /* - * Newly recognized interrupts are injected via either virtual interrupt - * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is - * disabled in two cases: + * If APICv is enabled and L2 is not active, then update the Requesting + * Virtual Interrupt (RVI) portion of vmcs01.GUEST_INTR_STATUS with the + * highest priority IRR to deliver the IRQ via Virtual Interrupt + * Delivery. 
Note, this is required even if the highest priority IRQ + * was already pending in the IRR, as RVI isn't updated in lockstep with + * the IRR (unlike apic->irr_pending). + * + * For the cases where Virtual Interrupt Delivery can't be used: * * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a @@ -7063,10 +7078,29 @@ int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * 2) If APICv is disabled for this vCPU, assigned devices may still * attempt to post interrupts. The posted interrupt vector will cause * a VM-Exit and the subsequent entry will call sync_pir_to_irr. + * + * In both cases, set KVM_REQ_EVENT if and only if the highest priority + * pending IRQ came from the PIR, as setting KVM_REQ_EVENT if any IRQ + * is pending may put the vCPU into an infinite loop, e.g. if the IRQ + * is blocked, then it will stay pending until an IRQ window is opened. + * + * Note! It's possible that one or more IRQs were moved from the PIR + * to the IRR _without_ max_irr_is_from_pir being true! I.e. if there + * was a higher priority IRQ already pending in the IRR. Not setting + * KVM_REQ_EVENT in this case is intentional and safe. If APICv is + * inactive, or L2 is running with exit-on-interrupt off (in vmcs12), + * i.e. without nested virtual interrupt delivery, then there's no need + * to request an IRQ window as the lower priority IRQ only needs to be + * delivered when the higher priority IRQ is dismissed from the ISR, + * i.e. on the next EOI, and EOIs are always intercepted if APICv is + * disabled or if L2 is running without nested VID. If L2 is running + * with exit-on-interrupt on (in vmcs12), then the higher priority IRQ will + * trigger a nested VM-Exit, at which point KVM will re-evaluate L1's + * pending IRQs.
*/ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) vmx_set_rvi(max_irr); - else if (got_posted_interrupt) + else if (max_irr_is_from_pir) kvm_make_request(KVM_REQ_EVENT, vcpu); return max_irr; @@ -7083,9 +7117,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -void vmx_do_interrupt_irqoff(unsigned long entry); -void vmx_do_nmi_irqoff(void); - static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) { /* @@ -7127,17 +7158,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - /* - * Invoke the kernel's IRQ handler for the vector. Use the FRED path - * when it's available even if FRED isn't fully enabled, e.g. even if - * FRED isn't supported in hardware, in order to avoid the indirect - * CALL in the non-FRED path. - */ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); - if (IS_ENABLED(CONFIG_X86_FRED)) - fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); - else - vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); + x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector); kvm_after_interrupt(vcpu); vcpu->arch.at_instruction_boundary = true; @@ -7447,10 +7469,7 @@ noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) return; kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - if (cpu_feature_enabled(X86_FEATURE_FRED)) - fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); - else - vmx_do_nmi_irqoff(); + x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); kvm_after_interrupt(vcpu); } @@ -8131,7 +8150,7 @@ static __init void vmx_set_cpu_caps(void) * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code * fails, so disable CET in this case too. 
*/ - if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || + if (!enable_cet || !enable_unrestricted_guest || !cpu_has_vmx_basic_no_hw_errcode_cc()) { kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); @@ -8606,6 +8625,9 @@ __init int vmx_hardware_setup(void) !cpu_has_vmx_invept_global()) enable_ept = 0; + if (!cpu_has_load_cet_ctrl()) + enable_cet = 0; + /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { pr_err_ratelimited("NX (Execute Disable) not supported\n"); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a1b63c63d1a..0550359ed798 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4876,7 +4876,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = tdp_enabled; break; case KVM_CAP_X86_APIC_BUS_CYCLES_NS: - r = APIC_BUS_CYCLE_NS_DEFAULT; + r = kvm ? kvm->arch.apic_bus_cycle_ns : APIC_BUS_CYCLE_NS_DEFAULT; break; case KVM_CAP_EXIT_HYPERCALL: r = KVM_EXIT_HYPERCALL_VALID_MASK; @@ -5227,8 +5227,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration */ - if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) { + if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs)) + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + else + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; @@ -13366,6 +13371,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); + ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10); + ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, 
RATELIMIT_MSG_ON_RELEASE); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -14323,7 +14330,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) * the RAP (Return Address Predicator). */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) - kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS); kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f0e77e084482..63de8e8684f2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -686,7 +686,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) - efi_crash_gracefully_on_page_fault(address); + efi_crash_gracefully_on_page_fault(address, regs); /* Only not-present faults should be handled by KFENCE. */ if (!(error_code & X86_PF_PROT) && diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index df24ffc6105d..90a065fcb1fa 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -761,7 +761,8 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, * @return: Returns, if the page fault is not handled. This function * will never return if the page fault is handled successfully. */ -void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) +void efi_crash_gracefully_on_page_fault(unsigned long phys_addr, + const struct pt_regs *regs) { if (!IS_ENABLED(CONFIG_X86_64)) return; @@ -770,7 +771,7 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) * If we get an interrupt/NMI while processing an EFI runtime service * then this is a regular OOPS, not an EFI failure. 
*/ - if (in_interrupt()) + if (!in_task()) return; /* @@ -811,6 +812,14 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) } /* + * The API does not permit entering a kernel mode FPU section with + * interrupts enabled and leaving it with interrupts disabled. So + * re-enable interrupts now if they were enabled when the page fault + * occurred. + */ + local_irq_restore(regs->flags); + + /* * Before calling EFI Runtime Service, the kernel has switched the * calling process to efi_mm. Hence, switch back to task_mm. */ diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c index f647557d38ac..7e9091c640be 100644 --- a/arch/x86/virt/hw.c +++ b/arch/x86/virt/hw.c @@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void) { cpu_emergency_virt_cb *kvm_callback; - kvm_callback = rcu_dereference(kvm_emergency_callback); + /* + * RCU may not be watching the crashing CPU here, so rcu_dereference() + * triggers a suspicious-RCU-usage splat. In principle, a concurrent + * KVM module unload could race with this read; see commit 2baa33a8ddd6 + * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown") + * which notes that nothing prevents module unload during panic/reboot. + * + * However, taking a lock here would be riskier than the current race: + * the system is going down via NMI shootdown, and any lock could be + * held by an already-stopped CPU. Use rcu_dereference_raw() to silence + * the lockdep splat and accept the comically small remaining race; + * panic context inherently cannot guarantee complete correctness. 
+ */ + kvm_callback = rcu_dereference_raw(kvm_emergency_callback); if (kvm_callback) kvm_callback(); } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index c80d0058efd1..3eee5f84f8a7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2145,7 +2145,10 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void xen_enter_lazy_mmu(void) { - enter_lazy(XEN_LAZY_MMU); + preempt_disable(); + if (xen_get_lazy_mode() != XEN_LAZY_MMU) + enter_lazy(XEN_LAZY_MMU); + preempt_enable(); } static void xen_flush_lazy_mmu(void) @@ -2182,7 +2185,8 @@ static void xen_leave_lazy_mmu(void) { preempt_disable(); xen_mc_flush(); - leave_lazy(XEN_LAZY_MMU); + if (xen_get_lazy_mode() != XEN_LAZY_NONE) + leave_lazy(XEN_LAZY_MMU); preempt_enable(); } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ac8021c3a997..41251d4cf953 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -655,7 +655,7 @@ static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) /* Fill new entry (keep size and page offset). */ entry->type = swap_entry->type; entry->addr = entry_end - swap_size + - swap_addr - swap_entry->addr; + swap_entry->addr - swap_addr; entry->size = swap_entry->size; /* Convert old entry to RAM, align to pages. */ @@ -695,17 +695,22 @@ static void __init xen_e820_resolve_conflicts(phys_addr_t start, return; end = start + size; - entry = xen_e820_table.entries; + mapcnt = 0; - for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + while (mapcnt < xen_e820_table.nr_entries) { + entry = xen_e820_table.entries + mapcnt; if (entry->addr >= end) return; if (entry->addr + entry->size > start && - entry->type == E820_TYPE_NVS) + entry->type == E820_TYPE_NVS) { xen_e820_swap_entry_with_ram(entry); + /* E820 map has been changed, restart loop! 
*/ + mapcnt = 0; + continue; + } - entry++; + mapcnt++; } } diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 13fe45dea296..e57af619263a 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -6,5 +6,6 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h |
